2018-08-29
Updated: number of unique linked/non-linked words, average and maximum disjunct length
import os, sys, time
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Make project code importable from this notebook.  module_path is the parent
# of the current working directory (presumably the repository root -- confirm),
# and the grammar learner sources are expected under src/grammar_learner/.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

grammar_learner_path = module_path + '/src/grammar_learner/'
if grammar_learner_path not in sys.path:
    sys.path.append(grammar_learner_path)
from utl import UTC
from widgets import corpus_historgams
# Options passed through unchanged (as **kwargs) to each corpus_historgams
# call in this notebook.
kwargs = dict(
    parse_mode='lower',
    left_wall='',
    period=False,
    context=2,
    grammar_rules=2,
    tmpath='',
    verbose='min',
)
# Remember when the run started (seconds since the epoch).
start = time.time()
# Output directory under <module_path>/output/, suffixed with the first ten
# characters of the UTC() string (presumably the YYYY-MM-DD date -- confirm
# against utl.UTC).
out_dir = '{}/output/corpora-statistics-{}'.format(module_path, str(UTC())[:10])
# Echo the current timestamp as reported by UTC().
print(UTC())
# Corpus identifiers; the corpus_historgams calls select them by index.
corpora = [
    'data/POC-Turtle',
    'data/POC-English-Amb',
    'data/CDS-caps-br-text+brent9mos',
    'bigdata/Gutenberg-Children-Books',
    'bigdata/Pubmed',
]

# Dataset (parse variant) identifiers, likewise selected by index.
datasets = [
    'MST-fixed-manually',
    'LG-English',
    'R=6-Weight=6:R-mst-weight=+1:R',
    'R=6-Weight=6:R-mst-weight=+1:R-adagram',
    'LG-English-caps',
]
# Run corpus_historgams for each (corpus, dataset) pair, in a fixed order.

# corpora[0]: the two-flag list is passed positionally (fourth argument).
# NOTE(review): the remaining calls pass the same kind of list as `logscale=`;
# presumably the fourth positional parameter is `logscale` too -- confirm
# against widgets.corpus_historgams before unifying the two call styles.
for _dataset_idx in (0, 2):
    corpus_historgams(module_path, corpora[0], datasets[_dataset_idx],
                      [False, False], **kwargs)

# Remaining corpora: (corpus index, dataset index, logscale flags).
# Each row carries its own list literal so every call receives a fresh list.
_runs = (
    (1, 0, [False, True]),
    (1, 1, [False, True]),
    (1, 2, [False, True]),
    (2, 1, [True, True]),
    (2, 2, [True, True]),
    (3, 1, [True, True]),
    (3, 2, [True, True]),
    (4, 4, [True, True]),
)
for _corpus_idx, _dataset_idx, _scales in _runs:
    corpus_historgams(module_path, corpora[_corpus_idx],
                      datasets[_dataset_idx], logscale=_scales, **kwargs)