github.com/singnet/language-learning/src/grammar_learner ⇒ README
import os, sys, time
module_path = os.path.abspath(os.path.join('..'))               # repository root
if module_path not in sys.path: sys.path.append(module_path)
from src.grammar_learner.learner import learn_grammar
from src.grammar_learner.utl import UTC
out_dir = module_path + '/output/Grammar_Learner_Tutorial_' + str(UTC())[:10]
kwargs = {'output_grammar': out_dir + '/1st_test'}              # where the learned grammar is saved
print(UTC(), ':: module_path:', module_path, '\nout_dir:', out_dir)
kwargs['input_parses'] = module_path + '/data/POC-Turtle/MST-fixed-manually'
kwargs                                                          # display the settings collected so far
with open(kwargs['input_parses'] + '/poc-turtle-parses-gold.txt', 'r') as f:
    lines = f.read().splitlines()
for line in lines: print(line)
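Each block in a ULL parse file is a tokenized sentence line followed by one link per line, each link written as two (index, word) pairs; indices refer to word positions in the sentence, with the optional ###LEFT-WALL### token at index 0, and blocks are separated by blank lines. An illustrative sketch of the shape (not the actual file contents):

<word_1> <word_2> ... <word_n>
<i> <word_i> <j> <word_j>
<i> <word_i> <j> <word_j>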
re = learn_grammar(**kwargs)        # returns an ordered dict of results and file paths
type(re)                            # collections.OrderedDict
re.keys()
re['corpus_stats']                  # corpus statistics (a list of rows)
re['corpus_stats_file']             # path to the saved corpus_stats.txt
with open(re['corpus_stats_file'], 'r') as f: lines = f.read().splitlines()
for line in lines: print(line)
{k: v for k, v in re.items() if type(v) is not list}    # scalar items: settings and file paths
re['cat_tree_file']                 # path to the saved category tree file, cat_tree.txt
with open(re['cat_tree_file'], 'r') as f: lines = f.read().splitlines()
for line in lines: print(line)
re['grammar_file']                  # path to the saved Link Grammar .dict file
with open(re['grammar_file'], 'r') as f: lines = f.read().splitlines()
for line in lines: print(line)
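The three read-and-print snippets above repeat one pattern; a small helper like the sketch below (hypothetical, not part of the grammar_learner package) avoids the repetition:

def show(path):                     # hypothetical helper, not in grammar_learner
    # Print a text file returned by learn_grammar line by line.
    with open(path, 'r') as f:
        for line in f.read().splitlines():
            print(line)

show(re['grammar_file'])            # same output as the snippet above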
The full set of learn_grammar parameters and their defaults is documented in github.com/singnet/language-learning/src/grammar_learner/README.md:
kwargs = { # defaults:
# input and output files and paths:
'input_parses' : <input> , # path to directory with input parses
'output_grammar' : <output> , # filename or path to store Link Grammar .dict file
# parsing:
'max_sentence_length' : 99 , # filter: max number of parsed words in sentences used for learning
'max_unparsed_words' : 0 , # filter: max number of not parsed words allowed in a sentence
'left_wall' : '' , # '' or 'none': don't use; 'LEFT-WALL': replace the ###LEFT-WALL### tag with 'LEFT-WALL'
'period' : False , # use the full stop (end-of-sentence mark) in link learning
# word (vector) space:
'word_space' : 'embeddings', # 'embeddings' / 'discrete' / 'sparse' -- see comments below
'context' : 2 , # 1: connectors / 2: disjuncts;
# settings for the 'embeddings' word_space:
'dim_reduction' : 'svd' , # 'svd', or 'none' for the 'discrete' and 'sparse' word_space options
'dim_max' : 100 , # max vector space dimensionality for SVD
'sv_min' : 0.1 , # minimal singular value (fraction of the max value)
# clustering:
'clustering' : 'kmeans' , # 'kmeans' / 'group' / 'agglomerative'... -- see comments below
'cluster_range' : [2,50,1,1] , # min, max, step, repeat / other options described below
'cluster_criteria' : 'silhouette', # optimal clustering criterion (legacy setting for 'kmeans' clustering)
'clustering_metric' : ['silhouette', 'cosine'], # new setting (October 2018) -- comments below
# grammar induction and generalization:
'grammar_rules' : 2 , # 1: connectors / 2: disjuncts
'rules_generalization' : 'off' , # 'off' / 'hierarchical' / 'jaccard' -- see comments below
}
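Any subset of these keys can be passed to learn_grammar; keys left out should fall back to the defaults listed above. A minimal sketch reusing the tutorial variables defined in the first example (the 'defaults_demo' subdirectory name is made up for illustration):

kwargs = {
    'input_parses'  : module_path + '/data/POC-Turtle/MST-fixed-manually',
    'output_grammar': out_dir + '/defaults_demo',   # hypothetical output subdirectory
    'word_space'    : 'discrete',                   # no SVD embeddings
    'clustering'    : 'group',                      # group identical lexical entries
    'grammar_rules' : 2                             # induce rules from disjuncts
}
re = learn_grammar(**kwargs)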
import os, sys, time
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path: sys.path.append(module_path)
from src.grammar_learner.learner import learn
from src.grammar_learner.utl import UTC, test_stats
from src.grammar_learner.read_files import check_dir, check_corpus
from src.grammar_learner.write_files import list2file
from src.grammar_learner.widgets import html_table
from src.grammar_learner.pqa_table import table_rows, params, wide_rows
tmpath = module_path + '/tmp/'
check_dir(tmpath, True, 'none')     # create the tmp directory if necessary
start = time.time()
runs = (1,1)                        # repetition counts used by the pqa_table test functions
print(UTC(), ':: module_path:', module_path)
corpus = 'CDS' # 'Child Directed Speech'
dataset = 'LG-E-clean'
kwargs = {
'corpus' : corpus ,
'dataset' : dataset ,
'left_wall' : '' , # don't use LEFT-WALL
'period' : False , # ignore end-of-sentence full stops
'context' : 2 , # disjuncts
'min_word_count': 1 , # no word frequency filtering
'min_link_count': 1 , # no link frequency filtering
'word_space' : 'sparse' ,
'clustering' : ('mean_shift', 2), # algorithm with its parameter
'clustering_metric' : ['silhouette', 'cosine'],
'cluster_range' : [0] ,
'top_level' : 0.01 ,
'grammar_rules' : 2 , # induce rules from disjuncts
'max_disjuncts' : 1000000 , # cap on the number of disjuncts used
'stop_words' : [] ,
'tmpath' : '' ,
'verbose' : 'log+' , # logging verbosity
'template_path' : 'poc-turtle', # Link Grammar dictionary template
'linkage_limit' : 1000 } # Link Grammar parser setting
out_dir = module_path + '/output/Grammar_Learner_Tutorial_' + str(UTC())[:10]  # as in the first example
print(UTC(), '\n', out_dir)
Batch testing with src/grammar_learner/pqa_table.py:
#%%capture
rp = module_path + '/data/' + corpus + '/' + dataset
cp = rp # corpus path = reference_path
# kwargs['reference_path'] = module_path + '/data/CDS/LG-E-clean'
lines = [['1.1', corpus, dataset, 0, 0, 'none']]
a, _, header, log, rules = wide_rows(lines, out_dir, cp, rp, runs, **kwargs)
display(html_table([header] + a)); print(test_stats(log))
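Each entry of `lines` is one test specification. Judging from the calls above and the values used below, the fields are: a test ID, corpus, dataset, left_wall and period flags, and the rules-generalization mode ('none' or 'rules'). The field order is inferred from usage here, so check pqa_table.py before relying on it:

#       [ id  , corpus, dataset     , LW, ., generalization ]   # assumed field order
test1 = ['1.2', 'CDS' , 'LG-E-clean', 0 , 0, 'rules']           # hypothetical extra test
a, _, header, log, rules = wide_rows([test1], out_dir, cp, rp, runs, **kwargs)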
%%capture
lines = [
    [33, 'CDS', 'LG-E-551'         , 0, 0, 'none' ],
    [34, 'CDS', 'LG-E-551'         , 0, 0, 'rules'],
    [35, 'CDS', 'R=6-W=6:R-MW=+1:R', 0, 0, 'none' ],
    [36, 'CDS', 'R=6-W=6:R-MW=+1:R', 0, 0, 'rules']]
a, _, header, log, rules = wide_rows(lines, out_dir, cp, rp, runs, **kwargs)
display(html_table([header] + a)); print(test_stats(log))
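list2file (imported above but unused so far) can persist the results table next to the learned grammars; a sketch, assuming it writes one table row per line (the output filename is made up):

table = [header] + a                                  # rows produced by wide_rows above
list2file(table, out_dir + '/all_tests_table.txt')    # hypothetical filename
print(UTC(), ':: saved', out_dir + '/all_tests_table.txt')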
Results for the 'Child Directed Speech' corpus: http://langlearn.singularitynet.io/data/clustering_2019/html/Child-Directed-Speech-2019-01-03.html
2018 tests :: http://langlearn.singularitynet.io/data/clustering_2018/ ⇒ html
2019 tests :: http://langlearn.singularitynet.io/data/clustering_2019/ ⇒ html