Corpora statistics 2018-08-17

Basic settings

In [1]:
import os, sys, time
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path: sys.path.append(module_path)
grammar_learner_path = module_path + '/src/grammar_learner/'
if grammar_learner_path not in sys.path: sys.path.append(grammar_learner_path)
from utl import UTC
from widgets import corpus_historgams
kwargs = {
    'parse_mode'    :   'lower'     ,
    'left_wall'     :   ''          ,
    'period'        :   False       ,
    'context'       :   2           ,
    'grammar_rules' :   2           ,
    'tmpath'        :   ''          , 
    'verbose'       :   'min'       }
start = time.time()
out_dir = module_path + '/output/corpora-statistics-' + str(UTC())[:10]
print(UTC(), ':: out_dir =', module_path)
#print(UTC(), ':: module_path =', module_path)
2018-08-17 18:52:09 UTC :: out_dir = /home/obaskov/language-learning
In [2]:
corpora = ['data/POC-Turtle', 'data/POC-English-Amb', 'data/CDS-caps-br-text+brent9mos', \
           'bigdata/Gutenberg-Children-Books', 'bigdata/Pubmed']
datasets = ['MST-fixed-manually', 'LG-English', 'R=6-Weight=6:R-mst-weight=+1:R', \
            'R=6-Weight=6:R-mst-weight=+1:R-adagram', 'LG-English-caps']

POC-Turtle

In [3]:
corpus_historgams(module_path, corpora[0], datasets[0], [False,False], **kwargs)
data/POC-Turtle MST-fixed-manually 2018-08-17 18:52:09 UTC :

Number of sentences : 12
Average sentence length : 3
Number of unique parsed words in sentences : 13
Number of unique non-parsed [words] in sentences : 0
Total words count in sentences : 36
Non-parsed [words] count in sentences : 0
Average per-word counts : 3
Number of unique links : 19
Total links count : 24
Average per-link count : 1
Number of unique connectors : 17
Total connectors count : 48
Average connector count : 2.8
Number of unique disjuncts : 16
Total disjuncts count : 36
Average disjuncts count : 2.2
Number of unique seeds : 31
Total seeds count : 36
Average seed count : 1.2
In [4]:
corpus_historgams(module_path, corpora[0], datasets[2], [False,False], **kwargs)
data/POC-Turtle R=6-Weight=6:R-mst-weight=+1:R 2018-08-17 18:52:10 UTC :

Number of sentences : 12
Average sentence length : 3
Number of unique parsed words in sentences : 13
Number of unique non-parsed [words] in sentences : 0
Total words count in sentences : 36
Non-parsed [words] count in sentences : 0
Average per-word counts : 3
Number of unique links : 19
Total links count : 24
Average per-link count : 1
Number of unique connectors : 17
Total connectors count : 48
Average connector count : 2.8
Number of unique disjuncts : 18
Total disjuncts count : 36
Average disjuncts count : 2.0
Number of unique seeds : 31
Total seeds count : 36
Average seed count : 1.2

POC-English-Amb

In [5]:
corpus_historgams(module_path, corpora[1], datasets[0], logscale=[False,True], **kwargs)
data/POC-English-Amb MST-fixed-manually 2018-08-17 18:52:10 UTC :

Number of sentences : 88
Average sentence length : 6
Number of unique parsed words in sentences : 42
Number of unique non-parsed [words] in sentences : 0
Total words count in sentences : 485
Non-parsed [words] count in sentences : 0
Average per-word counts : 12
Number of unique links : 117
Total links count : 397
Average per-link count : 3
Number of unique connectors : 69
Total connectors count : 794
Average connector count : 11.5
Number of unique disjuncts : 143
Total disjuncts count : 485
Average disjuncts count : 3.4
Number of unique seeds : 216
Total seeds count : 485
Average seed count : 2.2
In [6]:
corpus_historgams(module_path, corpora[1], datasets[1], logscale=[False,True], **kwargs)
data/POC-English-Amb LG-English 2018-08-17 18:52:11 UTC :

Number of sentences : 88
Average sentence length : 6
Number of unique parsed words in sentences : 42
Number of unique non-parsed [words] in sentences : 0
Total words count in sentences : 485
Non-parsed [words] count in sentences : 0
Average per-word counts : 12
Number of unique links : 136
Total links count : 459
Average per-link count : 3
Number of unique connectors : 75
Total connectors count : 918
Average connector count : 12.2
Number of unique disjuncts : 167
Total disjuncts count : 485
Average disjuncts count : 2.9
Number of unique seeds : 234
Total seeds count : 485
Average seed count : 2.1
In [7]:
corpus_historgams(module_path, corpora[1], datasets[2], logscale=[False,True], **kwargs)
data/POC-English-Amb R=6-Weight=6:R-mst-weight=+1:R 2018-08-17 18:52:11 UTC :

Number of sentences : 88
Average sentence length : 6
Number of unique parsed words in sentences : 42
Number of unique non-parsed [words] in sentences : 0
Total words count in sentences : 485
Non-parsed [words] count in sentences : 0
Average per-word counts : 12
Number of unique links : 114
Total links count : 394
Average per-link count : 3
Number of unique connectors : 80
Total connectors count : 788
Average connector count : 9.8
Number of unique disjuncts : 148
Total disjuncts count : 482
Average disjuncts count : 3.3
Number of unique seeds : 210
Total seeds count : 482
Average seed count : 2.3

Child Directed Speech

In [8]:
corpus_historgams(module_path, corpora[2], datasets[1], logscale=[True,True], **kwargs)
data/CDS-caps-br-text+brent9mos LG-English 2018-08-17 18:52:12 UTC :

Number of sentences : 38181
Average sentence length : 3
Number of unique parsed words in sentences : 3399
Number of unique non-parsed [words] in sentences : 193
Total words count in sentences : 124185
Non-parsed [words] count in sentences : 11962
Average per-word counts : 37
Number of unique links : 24785
Total links count : 91531
Average per-link count : 4
Number of unique connectors : 5285
Total connectors count : 183062
Average connector count : 34.6
Number of unique disjuncts : 29727
Total disjuncts count : 114294
Average disjuncts count : 3.8
Number of unique seeds : 50190
Total seeds count : 114294
Average seed count : 2.3
In [9]:
corpus_historgams(module_path, corpora[2], datasets[2], logscale=[True,True], **kwargs)
data/CDS-caps-br-text+brent9mos R=6-Weight=6:R-mst-weight=+1:R 2018-08-17 18:52:16 UTC :

Number of sentences : 38176
Average sentence length : 3
Number of unique parsed words in sentences : 3692
Number of unique non-parsed [words] in sentences : 0
Total words count in sentences : 130104
Non-parsed [words] count in sentences : 0
Average per-word counts : 35
Number of unique links : 30876
Total links count : 91337
Average per-link count : 3
Number of unique connectors : 6308
Total connectors count : 182674
Average connector count : 29.0
Number of unique disjuncts : 33147
Total disjuncts count : 120166
Average disjuncts count : 3.6
Number of unique seeds : 57446
Total seeds count : 120166
Average seed count : 2.1

Gutenberg Children books

In [10]:
corpus_historgams(module_path, corpora[3], datasets[1], logscale=[True,True], **kwargs)
bigdata/Gutenberg-Children-Books LG-English 2018-08-17 18:52:19 UTC :

Number of sentences : 207108
Average sentence length : 13
Number of unique parsed words in sentences : 37461
Number of unique non-parsed [words] in sentences : 594
Total words count in sentences : 2590048
Non-parsed [words] count in sentences : 139061
Average per-word counts : 69
Number of unique links : 544374
Total links count : 2418348
Average per-link count : 4
Number of unique connectors : 62244
Total connectors count : 4836696
Average connector count : 77.7
Number of unique disjuncts : 848911
Total disjuncts count : 2515923
Average disjuncts count : 3.0
Number of unique seeds : 1312881
Total seeds count : 2515923
Average seed count : 1.9
In [11]:
corpus_historgams(module_path, corpora[3], datasets[1], logscale=[True,True], **kwargs)
bigdata/Gutenberg-Children-Books LG-English 2018-08-17 18:53:50 UTC :

Number of sentences : 207108
Average sentence length : 13
Number of unique parsed words in sentences : 37461
Number of unique non-parsed [words] in sentences : 594
Total words count in sentences : 2590048
Non-parsed [words] count in sentences : 139061
Average per-word counts : 69
Number of unique links : 544374
Total links count : 2418348
Average per-link count : 4
Number of unique connectors : 62244
Total connectors count : 4836696
Average connector count : 77.7
Number of unique disjuncts : 848911
Total disjuncts count : 2515923
Average disjuncts count : 3.0
Number of unique seeds : 1312881
Total seeds count : 2515923
Average seed count : 1.9

Pubmed

In [12]:
corpus_historgams(module_path, corpora[4], datasets[4], logscale=[True,True], **kwargs)
bigdata/Pubmed LG-English-caps 2018-08-17 18:55:26 UTC :

Number of sentences : 165348
Average sentence length : 17
Number of unique parsed words in sentences : 68807
Number of unique non-parsed [words] in sentences : 535
Total words count in sentences : 2805123
Non-parsed [words] count in sentences : 63888
Average per-word counts : 41
Number of unique links : 710645
Total links count : 2721472
Average per-link count : 4
Number of unique connectors : 108227
Total connectors count : 5442944
Average connector count : 50.3
Number of unique disjuncts : 1063900
Total disjuncts count : 2785992
Average disjuncts count : 2.6
Number of unique seeds : 1630809
Total seeds count : 2785992
Average seed count : 1.7