SingularityNET Unsupervised Language Learning

GitHub.com/SingNET/Language-Learning README

Jump start

Basic settings and imports

In [1]:
import os, sys, time
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path: sys.path.append(module_path)
from src.grammar_learner.learner import learn_grammar
from src.grammar_learner.utl import UTC
out_dir = module_path + '/output/Grammar_Learner_Tutorial_' + str(UTC())[:10]
kwargs = {'output_grammar': out_dir + '/1st_test'}
print(UTC(), ':: module_path:', module_path, '\nout_dir:', out_dir)
2019-02-22 08:20:48 UTC :: module_path: /home/obaskov/py/language-learning 
out_dir: /home/obaskov/py/language-learning/output/Grammar_Learner_Tutorial_2019-02-22

Input: "parses"

In [3]:
kwargs['input_parses'] = module_path + '/data/POC-Turtle/MST-fixed-manually'
kwargs
Out[3]:
{'output_grammar': '/home/obaskov/py/language-learning/output/Grammar_Learner_Tutorial_2019-02-22/1st_test',
 'input_parses': '/home/obaskov/py/language-learning/data/POC-Turtle/MST-fixed-manually'}
In [5]:
with open(kwargs['input_parses'] + '/poc-turtle-parses-gold.txt', 'r') as f: 
    lines = f.read().splitlines()
for line in lines[:]: print(line)
tuna isa fish .
0 ###LEFT-WALL### 1 tuna
1 tuna 2 isa
2 isa 3 fish
3 fish 4 .

herring isa fish .
0 ###LEFT-WALL### 1 herring
1 herring 2 isa
2 isa 3 fish
3 fish 4 .

tuna has fin .
0 ###LEFT-WALL### 1 tuna
1 tuna 2 has
2 has 3 fin
3 fin 4 .

herring has fin .
0 ###LEFT-WALL### 1 herring
1 herring 2 has
2 has 3 fin
3 fin 4 .

parrot isa bird .
0 ###LEFT-WALL### 1 parrot
1 parrot 2 isa
2 isa 3 bird
3 bird 4 .

eagle isa bird .
0 ###LEFT-WALL### 1 eagle
1 eagle 2 isa
2 isa 3 bird
3 bird 4 .

parrot has wing .
0 ###LEFT-WALL### 1 parrot
1 parrot 2 has
2 has 3 wing
3 wing 4 .

eagle has wing .
0 ###LEFT-WALL### 1 eagle
1 eagle 2 has
2 has 3 wing
3 wing 4 .

fin isa extremity .
0 ###LEFT-WALL### 1 fin
1 fin 2 isa
2 isa 3 extremity
3 extremity 4 .

wing isa extremity .
0 ###LEFT-WALL### 1 wing
1 wing 2 isa
2 isa 3 extremity
3 extremity 4 .

fin has scale .
0 ###LEFT-WALL### 1 fin
1 fin 2 has
2 has 3 scale
3 scale 4 .

wing has feather .
0 ###LEFT-WALL### 1 wing
1 wing 2 has
2 has 3 feather
3 feather 4 .
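
Each block above is one sentence followed by its parse: one link per line in the form "index1 word1 index2 word2", with ###LEFT-WALL### at index 0 marking the start of the sentence. A minimal reader for this format (an illustrative sketch; read_parses is a hypothetical helper, not part of the library):

In [ ]:
def read_parses(path):
    """Read a ULL parse file: blank-line-separated blocks, each holding
    a sentence line followed by 'index1 word1 index2 word2' link lines."""
    parses = []
    with open(path, 'r') as f:
        blocks = f.read().strip().split('\n\n')
    for block in blocks:
        rows = block.splitlines()
        links = [row.split() for row in rows[1:]]
        parses.append((rows[0], [(int(i), w1, int(j), w2) for i, w1, j, w2 in links]))
    return parses

parses = read_parses(kwargs['input_parses'] + '/poc-turtle-parses-gold.txt')
print(len(parses), 'sentences; first:', parses[0])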

LEARN!

In [6]:
learn_grammar(**kwargs)
Out[6]:
OrderedDict([('start', '2019-02-22 08:22:43 UTC'),
             ('learn_grammar', 'v.0.7.81231'),
             ('project_directory',
              '/home/obaskov/py/language-learning/output/Grammar_Learner_Tutorial_2019-02-22/1st_test'),
             ('input files',
              ['/home/obaskov/py/language-learning/data/POC-Turtle/MST-fixed-manually/poc-turtle-parses-gold.txt']),
             ('corpus_stats',
              [['Number of sentences    ', 12],
               ['Maximum sentence length', 3],
               ['Average sentence length', 3],
               ['Number of unique words in sentences', 13],
               ['Number of unique parsed words      ', 13],
               ['Number of unique non-parsed [words]', 0],
               ['Number of unique linked words      ', 13],
               ['Number of unique non-linked words  ', 0],
               ['Total  words count in sentences    ', 36],
               ['Parsed words count in sentences    ', 36],
               ['Non-parsed [words] in sentences    ', 0],
               ['Non-linked words (excl.non-parsed) ', 0],
               ['Average word count ', 3],
               ['Unique links number', 19],
               ['Total  links count ', 24],
               ['Average link count ', 1.3],
               ['Average links per linked word', 2],
               ['Unique connectors number', 17],
               ['Total  connectors count ', 48],
               ['Average connector count ', 2.8],
               ['Unique disjuncts number', 16],
               ['Total  disjuncts count ', 36],
               ['Average disjunct count ', 2.2],
               ['Average disjunct length', 1.3],
               ['Maximum disjunct length', 2],
               ['Unique seeds number', 31],
               ['Total  seeds count ', 36],
               ['Average seed count ', 1.2]]),
             ('terms', 'disjuncts'),
             ('parsed_links', 36),
             ('unique_links', 31),
             ('unique_words', 13),
             ('unique_terms', 16),
             ('word-term_pairs', 31),
             ('corpus_stats_file',
              '/home/obaskov/py/language-learning/output/Grammar_Learner_Tutorial_2019-02-22/1st_test/corpus_stats.txt'),
             ('category_learner', 'v.0.7.81231'),
             ('vector_space_dim', 2),
             ('silhouette', 0.8974816894026034),
             ('inertia', 0.004335897519749409),
             ('generalization', 'none: off'),
             ('cat_tree_file',
              '/home/obaskov/py/language-learning/output/Grammar_Learner_Tutorial_2019-02-22/1st_test/4_cat_tree.txt'),
             ('grammar_file',
              '/home/obaskov/py/language-learning/output/Grammar_Learner_Tutorial_2019-02-22/1st_test/dict_4C_2019-02-22_0007.4.0.dict'),
             ('grammar_clusters', 4),
             ('grammar_rules', 4),
             ('finish', '2019-02-22 08:22:44 UTC'),
             ('grammar_learn_time', '00:00:00')])

Response: re = learn_grammar(**kwargs)

In [7]:
re = learn_grammar(**kwargs)
type(re)
Out[7]:
collections.OrderedDict
In [8]:
# from collections import OrderedDict
re.keys()
Out[8]:
odict_keys(['start', 'learn_grammar', 'project_directory', 'input files', 'corpus_stats', 'terms', 'parsed_links', 'unique_links', 'unique_words', 'unique_terms', 'word-term_pairs', 'corpus_stats_file', 'category_learner', 'vector_space_dim', 'silhouette', 'inertia', 'generalization', 'cat_tree_file', 'grammar_file', 'grammar_clusters', 'grammar_rules', 'finish', 'grammar_learn_time'])

Corpus statistics: re['corpus_stats'] and the corpus_stats.txt file

In [9]:
re['corpus_stats']
Out[9]:
[['Number of sentences    ', 12],
 ['Maximum sentence length', 3],
 ['Average sentence length', 3],
 ['Number of unique words in sentences', 13],
 ['Number of unique parsed words      ', 13],
 ['Number of unique non-parsed [words]', 0],
 ['Number of unique linked words      ', 13],
 ['Number of unique non-linked words  ', 0],
 ['Total  words count in sentences    ', 36],
 ['Parsed words count in sentences    ', 36],
 ['Non-parsed [words] in sentences    ', 0],
 ['Non-linked words (excl.non-parsed) ', 0],
 ['Average word count ', 3],
 ['Unique links number', 19],
 ['Total  links count ', 24],
 ['Average link count ', 1.3],
 ['Average links per linked word', 2],
 ['Unique connectors number', 17],
 ['Total  connectors count ', 48],
 ['Average connector count ', 2.8],
 ['Unique disjuncts number', 16],
 ['Total  disjuncts count ', 36],
 ['Average disjunct count ', 2.2],
 ['Average disjunct length', 1.3],
 ['Maximum disjunct length', 2],
 ['Unique seeds number', 31],
 ['Total  seeds count ', 36],
 ['Average seed count ', 1.2]]
In [10]:
re['corpus_stats_file']
Out[10]:
'/home/obaskov/py/language-learning/output/Grammar_Learner_Tutorial_2019-02-22/1st_test/corpus_stats.txt'
In [11]:
with open(re['corpus_stats_file'], 'r') as f: lines = f.read().splitlines()
for line in lines[:]: print(line)
Number of sentences    	12
Maximum sentence length	3
Average sentence length	3
Number of unique words in sentences	13
Number of unique parsed words      	13
Number of unique non-parsed [words]	0
Number of unique linked words      	13
Number of unique non-linked words  	0
Total  words count in sentences    	36
Parsed words count in sentences    	36
Non-parsed [words] in sentences    	0
Non-linked words (excl.non-parsed) 	0
Average word count 	3
Unique links number	19
Total  links count 	24
Average link count 	1.3
Average links per linked word	2
Unique connectors number	17
Total  connectors count 	48
Average connector count 	2.8
Unique disjuncts number	16
Total  disjuncts count 	36
Average disjunct count 	2.2
Average disjunct length	1.3
Maximum disjunct length	2
Unique seeds number	31
Total  seeds count 	36
Average seed count 	1.2
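
Since re['corpus_stats'] is a list of [label, value] pairs, it converts directly to a dict for programmatic access (a minimal sketch; the labels are padded with trailing spaces, hence the strip()):

In [ ]:
stats = {label.strip(): value for label, value in re['corpus_stats']}
print(stats['Number of sentences'])      # 12
print(stats['Unique disjuncts number'])  # 16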

Response beyond corpus_stats

In [12]:
{k:v for k,v in re.items() if type(v) is not list}
Out[12]:
{'start': '2019-02-22 08:23:19 UTC',
 'learn_grammar': 'v.0.7.81231',
 'project_directory': '/home/obaskov/py/language-learning/output/Grammar_Learner_Tutorial_2019-02-22/1st_test',
 'terms': 'disjuncts',
 'parsed_links': 36,
 'unique_links': 31,
 'unique_words': 13,
 'unique_terms': 16,
 'word-term_pairs': 31,
 'corpus_stats_file': '/home/obaskov/py/language-learning/output/Grammar_Learner_Tutorial_2019-02-22/1st_test/corpus_stats.txt',
 'category_learner': 'v.0.7.81231',
 'vector_space_dim': 2,
 'silhouette': 0.8974816894026034,
 'inertia': 0.004335897519749409,
 'generalization': 'none: off',
 'cat_tree_file': '/home/obaskov/py/language-learning/output/Grammar_Learner_Tutorial_2019-02-22/1st_test/4_cat_tree.txt',
 'grammar_file': '/home/obaskov/py/language-learning/output/Grammar_Learner_Tutorial_2019-02-22/1st_test/dict_4C_2019-02-22_0007.4.0.dict',
 'grammar_clusters': 4,
 'grammar_rules': 4,
 'finish': '2019-02-22 08:23:19 UTC',
 'grammar_learn_time': '00:00:00'}

Category tree: re['cat_tree_file'] and the cat_tree.txt file

In [13]:
re['cat_tree_file']
Out[13]:
'/home/obaskov/py/language-learning/output/Grammar_Learner_Tutorial_2019-02-22/1st_test/4_cat_tree.txt'
In [14]:
with open(re['cat_tree_file'], 'r') as f: lines = f.read().splitlines()
for line in lines[:]: print(line)
B	0	1	0	eagle fin herring parrot tuna wing	0 0 0 0 0 0
C	0	2	0	bird extremity fish	0 0 0
D	0	3	0	isa	0
E	0	4	0	feather has scale	0 0 0
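
Each tab-separated row of the category tree file describes one cluster: its label, three tree-position fields, the space-separated member words, and a matching list of per-word scores (all zeros here). A minimal sketch extracting the cluster-to-words mapping, assuming this layout:

In [ ]:
clusters = {}
with open(re['cat_tree_file'], 'r') as f:
    for row in f.read().splitlines():
        fields = row.split('\t')
        clusters[fields[0]] = fields[4].split()  # label -> member words
print(clusters)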
In [15]:
re['grammar_file']
Out[15]:
'/home/obaskov/py/language-learning/output/Grammar_Learner_Tutorial_2019-02-22/1st_test/dict_4C_2019-02-22_0007.4.0.dict'
In [16]:
with open(re['grammar_file'], 'r') as f: lines = f.read().splitlines()
for line in lines[:]: print(line)
% Grammar Learner v.0.7 2019-02-22 08:23:19 UTC
<dictionary-version-number>: V0v0v7+;
<dictionary-locale>: EN4us+;

% B
"eagle" "fin" "herring" "parrot" "tuna" "wing":
(BD+) or (BE+) or (EB-);

% C
"bird" "extremity" "fish":
(DC-);

% D
"isa":
(BD- & DC+);

% E
"feather" "has" "scale":
(BE- & EB+) or (BE- & EE+) or (EE-);

<UNKNOWN-WORD>: XXX+;

% 4 word clusters, 4 Link Grammar rules.
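
The .dict file is a standard Link Grammar dictionary: each cluster is a quoted word list followed by its disjuncts, and a connector such as BD+ on a B-cluster word links rightward to a following word whose disjunct carries the matching BD-. For example, "tuna isa fish" is licensed by tuna (BD+), isa (BD- & DC+) and fish (DC-). A minimal sketch pairing each word list with its rule, assuming exactly the layout shown above (comment and header lines, then alternating word-list/rule lines):

In [ ]:
with open(re['grammar_file'], 'r') as f:
    rows = [r for r in f.read().splitlines()
            if r and not r.startswith('%') and not r.startswith('<')]
# remaining rows alternate: '"w1" "w2" ...:' then '(rule) or (rule);'
for words, rule in zip(rows[0::2], rows[1::2]):
    print(words.rstrip(':'), '=>', rule.rstrip(';'))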

More Turtle

In [ ]:
kwargs = {                              # defaults:
    # input and output files and paths:
    'input_parses'      : '<input>' ,   # path to directory with input parses
    'output_grammar'    : '<output>',   # filename or path to store the Link Grammar .dict file
    # parsing:
    'max_sentence_length'   :   99  ,   # filter: max number of parsed words in sentences used for learning
    'max_unparsed_words'    :   0   ,   # filter: max number of not parsed words allowed in a sentence
    'left_wall'     :   ''          ,   # '','none' - don't use / 'LEFT-WALL' - replace ###LEFT-WALL### tag with 'LEFT-WALL'
    'period'        :   False       ,   # use the full stop (end-of-sentence mark) in link learning
    # word (vector) space:
    'word_space'    :   'embeddings',   # 'embeddings' / 'discrete' / 'sparse' -- see comments below
    'context'       :   2           ,   # 1: connectors / 2: disjuncts
    # 'embeddings' 'word_space': 
    'dim_reduction' :   'svd'       ,   # 'svd' / 'none' for 'discrete', 'sparse' word_space
    'dim_max'       :   100         ,   # max vector space dimensionality for SVD
    'sv_min'        :   0.1         ,   # minimal singular value (fraction of the max value)
    # clustering:
    'clustering'    :   'kmeans'    ,   # 'kmeans' / 'group' / 'agglomerative'... -- see comments below
    'cluster_range' :   [2,50,1,1]  ,   # min, max, step, repeat / other options described below
    'cluster_criteria'  : 'silhouette', # optimal clustering criteria (legacy for 'kmeans' 'clustering')
    'clustering_metric' : ['silhouette', 'cosine'], # new setting (October 2018) -- comments below
    # grammar induction and generalization:
    'grammar_rules'         : 2     ,   # 1: connectors / 2: disjuncts
    'rules_generalization'  : 'off' ,   # 'off' / 'hierarchical' / 'jaccard' -- see comments below 
}
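
To try this configuration, overwrite the two placeholders with real paths and call learn_grammar as before (a sketch reusing the tutorial's Turtle corpus and output directory; '2nd_test' is just an example name):

In [ ]:
kwargs['input_parses']   = module_path + '/data/POC-Turtle/MST-fixed-manually'
kwargs['output_grammar'] = out_dir + '/2nd_test'
re = learn_grammar(**kwargs)
print(re['grammar_file'])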

Closer to reality

Basic settings

In [17]:
import os, sys, time
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path: sys.path.append(module_path)
from src.grammar_learner.learner import learn
from src.grammar_learner.utl import UTC, test_stats
from src.grammar_learner.read_files import check_dir, check_corpus
from src.grammar_learner.write_files import list2file
from src.grammar_learner.widgets import html_table
from src.grammar_learner.pqa_table import table_rows, params, wide_rows
tmpath = module_path + '/tmp/'
check_dir(tmpath, True, 'none')
start = time.time()
runs = (1,1)
print(UTC(), ':: module_path:', module_path)
2019-02-22 09:36:44 UTC :: module_path: /home/obaskov/py/language-learning

Test corpus settings

In [18]:
corpus = 'CDS' # 'Child Directed Speech'
dataset = 'LG-E-clean'
kwargs = {
    'corpus'        :   corpus      ,
    'dataset'       :   dataset     ,
    'left_wall'     :   ''          ,
    'period'        :   False       ,
    'context'       :   2           ,
    'min_word_count':   1           ,
    'min_link_count':   1           ,
    'word_space'    :   'sparse'    ,
    'clustering'    :   ('mean_shift', 2),
    'clustering_metric' : ['silhouette', 'cosine'],
    'cluster_range' :   [0]         ,
    'top_level'     :   0.01        ,
    'grammar_rules' :   2           ,
    'max_disjuncts' :   1000000     ,
    'stop_words'    :   []          ,
    'tmpath'        :   ''          ,
    'verbose'       :   'log+'      ,
    'template_path' :   'poc-turtle',
    'linkage_limit' :   1000        }
print(UTC(), '\n', out_dir)
2019-02-22 09:36:48 UTC 
 /home/obaskov/py/language-learning/output/Grammar_Learner_Tutorial_2019-02-22

Table: pqa_table.py

In [19]:
#%%capture
from src.grammar_learner.pqa_table import table_rows, params, wide_rows
corpus  = 'CDS' # 'Child Directed Speech'
dataset = 'LG-E-clean'
rp = module_path + '/data/' + corpus + '/' + dataset
cp = rp  # corpus path = reference_path
# kwargs['reference_path'] = module_path + '/data/CDS/LG-E-clean'
lines = [['1.1', corpus, dataset, 0, 0, 'none']]
a, _, header, log, rules = wide_rows(lines, out_dir, cp, rp, runs, **kwargs)
display(html_table([header] + a)); print(test_stats(log))
mean shift clustering, bandwidth = 2
cd: [[0 0 0 ... 0 0 0]
 [1 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
mean shift labels: [  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87   0  88  89
  90  91  92  93  94  95  96  97   0  98  99 100   0 101 102 103 104 105
 106 107 108 109 110 111 112 113 114 115 116 117 118 119   0 120 121 122
 123 124 125 126 127 128 129 130 131 132 133 134 135 136   0 137 138 139
 140 141 142 143   0 144 145 146 147 148 149 150 151 152 153 154 155   0
 156 157 158 159 160 161   0 162 163   0 164 165 166 167   0 168 169 170
 171   0 172 173   0   0   0   0 174 175 176   0 177 178 179 180 181 182
 183   0 184   0 185 186 187 188   0 189   0 190 191 192 193 194 195 196
 197 198   0 199 200 201   0   0   0 202 203 204 205   0 206 207   0   0
 208 209 210 211 212   0   0 213 214 215 216   0   0   0   0 217 218 219
   0 220 221 222   0 223 224   0   0   0 225   0 226   0 227   0 228 229
 230   0 231   0   0 232   0 233   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0]
mean shift centroids: [[0.         0.06779661 0.01694915 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [1.         0.         1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
Overall execution: 100%|██████████| 3037/3037 [00:01<00:00, 2257.56it/s]
CDS-LG-English-clean.ull: 100%|██████████| 3037/3037 [00:01<00:00, 2362.26sentences/s]

Line | Corpus | Parsing    | Space | Linkage    | Affinity | G12n | Threshold | Rules | MWC | NN  | SI  | PA   | PQ  | F1   | Top 5 cluster sizes
1.1  | CDS    | LG-E-clean | dMLEd | mean_shift | ---      | none | ---       | 234   | 1   | --- | 0.0 | 100% | 98% | 0.99 | [68, 1, 0]
Cleaned dictionary: 301 words, grammar learn time: 00:00:10, grammar test time: 00:00:01

Multi-line table

In [20]:
%%capture
# lines = [['1.1', corpus, dataset, 0, 0, 'none']]
lines = [
    [33, 'CDS' , 'LG-E-551'          ,0,0, 'none' ], 
    [34, 'CDS' , 'LG-E-551'          ,0,0, 'rules'], 
    [35, 'CDS' , 'R=6-W=6:R-MW=+1:R' ,0,0, 'none' ], 
    [36, 'CDS' , 'R=6-W=6:R-MW=+1:R' ,0,0, 'rules']]
a, _, header, log, rules = wide_rows(lines, out_dir, cp, rp, runs, **kwargs)
display(html_table([header] + a)); print(test_stats(log))
In [21]:
display(html_table([header] + a)); print(test_stats(log))
Line | Corpus | Parsing           | Space | Linkage    | Affinity | G12n  | Threshold | Rules | MWC | NN  | SI  | PA   | PQ  | F1   | Top 5 cluster sizes
33   | CDS    | LG-E-551          | dMLEd | mean_shift | ---      | none  | ---       | 234   | 1   | --- | 0.0 | 100% | 98% | 0.99 | [68, 1, 0]
34   | CDS    | LG-E-551          | dMLEd | mean_shift | ---      | rules | ---       | 234   | 1   | --- | 0.0 | 100% | 98% | 0.99 | [68, 1, 0]
35   | CDS    | R=6-W=6:R-MW=+1:R | dMLEd | mean_shift | ---      | none  | ---       | 234   | 1   | --- | 0.0 | 100% | 98% | 0.99 | [68, 1, 0]
36   | CDS    | R=6-W=6:R-MW=+1:R | dMLEd | mean_shift | ---      | rules | ---       | 234   | 1   | --- | 0.0 | 100% | 98% | 0.99 | [68, 1, 0]
Cleaned dictionary: 301 words, grammar learn time: 00:00:10, grammar test time: 00:00:01
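
The header and rows returned by wide_rows are plain Python lists, so the table can also be saved as tab-separated text without the HTML widget (a minimal sketch; list2file from src.grammar_learner.write_files, imported above, serves a similar purpose, and the file name here is just an example):

In [ ]:
with open(out_dir + '/CDS_tests.txt', 'w') as f:
    for row in [header] + a:
        f.write('\t'.join(str(cell) for cell in row) + '\n')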

Real "Child Directed Speech" tests take ~3 hours on 6 cores of a 12-core CPU:

http://langlearn.singularitynet.io/data/clustering_2019/html/Child-Directed-Speech-2019-01-03.html

More Jupyter notebook samples on GitHub:

GitHub.com/SingNET/Language-Learning/Notebooks