2019-02-09
Checking "Incremental Iterative Clustering with Continuous Dimension Reduction" — the last page of the "ULL 2019 - Report".
This notebook is shared as Iterative-Clustering-4-sentences-2019-02-09.html,
output data -- Iterative-Clustering-4-sentences-2019-02-09.
import os, sys, time
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path: sys.path.append(module_path)
from src.grammar_learner.utl import UTC, kwa
from src.grammar_learner.read_files import check_dir
from src.grammar_learner.widgets import html_table
from src.grammar_learner.pqa_table import params
from src.grammar_learner.incremental_clustering import iterate
# Scratch directory for the Grammar Learner's intermediate files.
tmpath = module_path + '/tmp/'
check_dir(tmpath, True, 'none')

table = []           # accumulates per-run result rows
start = time.time()  # wall-clock start of the whole notebook

# Date-stamped output directory for all runs in this notebook.
out_dir = (module_path
           + '/output/Iterative-Clustering-4-sentences-' + str(UTC())[:10])
print(UTC(), ':: out_dir:\n', out_dir)

corpus = '4_sentences'
dataset = 'sequence_1234'
# Input parses live under data/<corpus>/<dataset>; the same files
# serve as the reference corpus for parse-ability evaluation.
input_path = module_path + '/data/' + corpus + '/' + dataset
ref_corpus = input_path
# Shared Grammar Learner configuration. The cells below override
# 'context', 'grammar_rules', 'dataset' and 'cluster_range' before
# each `iterate(**kwargs)` run.
kwargs = {
# Corpora:
'corpus' : corpus ,
'dataset' : dataset ,
# 'input_parses' : input_path , # paths are set by 'corpus' and 'dataset'
# 'reference_path': ref_corpus , # reference_path = input_parses
# 'corpus_path' : ref_corpus , # corpus path = reference path
'module_path' : module_path , # language-learning dir (default)
# Word space:
'stop_words' : [] , # empty list: trash filter off
'min_word_count': 1 , # count threshold 1: no words dropped by frequency
'left_wall' : '' , # empty string — presumably disables LEFT-WALL handling
'period' : False , # presumably: ignore sentence-final periods — confirm
'context' : 2 , # disjunct-based word vector space
'word_space' : 'discrete' , # "ILE"
# Category learning:
'clustering' : 'group' , # NOTE(review): presumably ILE-style grouping of identical entries — confirm
'cluster_range' : 0 , # not used in ILE, can be used to mark dirs
'cluster_criteria' : 'silhouette',
'clustering_metric' : ('silhouette', 'cosine'),
'categories_generalization' : 'off',
# Grammar rules induction:
'grammar_rules' : 2 , # disjunct-based link grammar rules
'rules_generalization' : 'off' ,
'rules_merge' : 0.8 , # threshold — semantics defined in grammar_learner
'rules_aggregation' : 0.2 , # threshold — semantics defined in grammar_learner
'top_level' : 0.01 ,
# Etc...:
'out_path' : out_dir ,
'output_grammar': out_dir ,
'tmpath' : tmpath , # scratch dir created above
'verbose' : 'min' , # minimal console logging
'template_path' : 'poc-turtle', # presumably a Link Grammar dict template name
'linkage_limit' : 1000 , # presumably the Link Grammar parser linkage limit
'iterations' : 12 # number of iterative clustering passes
}
# Confirm the input-parses directory exists, then echo the .ull
# parse file so the corpus is visible in the notebook output.
if check_dir(input_path):
    print(UTC(), ':: input_path:\n', input_path)
kwargs['cluster_range'] = 6  # just marking dir
with open(input_path + '/4_sentences_1234.ull', 'r') as fh:
    corpus_lines = fh.read().splitlines()
for sentence in corpus_lines:
    print(sentence)
%%capture
kwargs['context'] = 1 # connector-based word space
kwargs['grammar_rules'] = 1 # connector-based grammar rules
t1, re1 = iterate(**kwargs)
display(html_table(t1)); print(re1['project_directory'][42:-12])
%%capture
kwargs['context'] = 1 # connector-based word space
kwargs['grammar_rules'] = 2 # disjunct-based grammar rules
t2, re2 = iterate(**kwargs)
display(html_table(t2)); print(re2['project_directory'][42:-12])
%%capture
kwargs['context'] = 2 # disjunct-based word space
kwargs['grammar_rules'] = 2 # disjunct-based grammar rules
t3, re3 = iterate(**kwargs)
display(html_table(t3)); print(re3['project_directory'][42:-12])
Last-line modification: "parrot" (eats ...) ⇒ "tuna" — to enable grouping "tuna" together with "parrot".
# Switch to the modified corpus (last line: parrot ⇒ tuna) and
# echo its .ull parse file to the notebook output.
kwargs['dataset'] = 'modified'
kwargs['cluster_range'] = 4  # just marking dir
with open(module_path + '/data/4_sentences/modified/modified_1234.ull', 'r') as fh:
    modified_lines = fh.read().splitlines()
for sentence in modified_lines:
    print(sentence)
%%capture
kwargs['context'] = 2 # disjunct-based word space
kwargs['grammar_rules'] = 2 # disjunct-based grammar rules
t4, re4 = iterate(**kwargs)
display(html_table(t4)); print(re4['project_directory'][42:-12])
%%capture
kwargs['context'] = 1 # connector based word space
kwargs['grammar_rules'] = 1 # connector based grammar rules
t5, re5 = iterate(**kwargs)
display(html_table(t5)); print(re5['project_directory'][42:-12])
%%capture
kwargs['context'] = 1 # connector based word space
kwargs['grammar_rules'] = 2 # disjunct based grammar rules
t6, re6 = iterate(**kwargs)
display(html_table(t6)); print(re6['project_directory'][42:-12])