2019-02-03
: intermediate
First tests of an unstable iterative clustering prototype.
This notebook is temporarily shared as the static page _iterative_clustering_2019-02-03_.html; the data is shared via the _iterative_clustering_2019-02-03_ folder.
import os, sys, time
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path: sys.path.append(module_path)
from src.grammar_learner.utl import UTC, kwa
from src.grammar_learner.read_files import check_dir
from src.grammar_learner.widgets import html_table
from src.grammar_learner.pqa_table import params
from src.grammar_learner.incremental_clustering import iterate
# Scratch directory under the module root.
# NOTE(review): check_dir presumably creates the directory when it is missing --
# confirm against src.grammar_learner.read_files.check_dir.
tmpath = '{}/tmp/'.format(module_path)
check_dir(tmpath, True, 'none')

table = []            # results table; rebound by every iterate() call below
start = time.time()   # wall-clock timestamp taken before the runs

# Output directory name embeds the first 10 characters of str(UTC())
# (presumably the YYYY-MM-DD date -- confirm against utl.UTC).
out_dir = '{}/output/_iterative_clustering_{}_'.format(module_path, str(UTC())[:10])
print(UTC(), ':: out_dir:\n', out_dir)

# Input corpus / dataset and the reference ("gold") parses file.
corpus = 'POC-English-Amb'
dataset = 'MST-fixed-manually'
input_path = '{}/data/{}/{}'.format(module_path, corpus, dataset)
ref_corpus = '{}/data/POC-English-Amb/MST-fixed-manually/poc-english_ex-parses-gold.txt'.format(module_path)
# Base keyword arguments handed to iterate(); the runs below mutate single keys
# of this dict in place, so every change carries over to the following runs.
kwargs = {
    # Corpora ('input_parses' is not passed: paths are set by 'corpus' and 'dataset')
    'corpus': 'POC-English-Amb',
    'dataset': 'MST-fixed-manually',
    'reference_path': ref_corpus,
    'corpus_path': ref_corpus,        # corpus path = reference path
    'module_path': module_path,       # language-learning dir (default)

    # Word space
    'stop_words': [],                 # trash filter off
    'min_word_count': 1,
    'left_wall': '',
    'period': False,
    'context': 2,                     # disjunct-based word vector space
    'word_space': 'vectors',          # "DRK"

    # Category learning
    'clustering': ('kmeans', 'kmeans++', 10),
    'cluster_range': (2, 50, 1, 5),
    'cluster_criteria': 'silhouette',
    'clustering_metric': ('silhouette', 'cosine'),
    'categories_generalization': 'off',

    # Grammar rules induction
    'grammar_rules': 2,               # disjunct-based link grammar rules
    'rules_generalization': 'off',
    'rules_merge': 0.8,
    'rules_aggregation': 0.2,
    'top_level': 0.01,

    # Output locations and miscellaneous settings
    'out_path': out_dir,
    'output_grammar': out_dir,
    'tmpath': tmpath,
    'verbose': 'min',
    'template_path': 'poc-turtle',
    'linkage_limit': 1000,
    'iterations': 7,
}
print(UTC())
%%capture
kwargs['context'] = 1
kwargs['grammar_rules'] = 1
table, re = iterate(**kwargs)
display(html_table(table))
print(re['project_directory'][42:-12])
%%capture
kwargs['grammar_rules'] = 2
table, re = iterate(**kwargs)
display(html_table(table))
print(re['project_directory'][42:-12])
%%capture
kwargs['context'] = 2
table, re = iterate(**kwargs)
display(html_table(table))
print(re['project_directory'][42:-12])
%%capture
kwargs['word_space'] = 'discrete'
kwargs['clustering'] = 'group'
table, re = iterate(**kwargs)
display(html_table(table))
print(re['project_directory'][42:-12])
%%capture
kwargs['word_space'] = 'sparse'
kwargs['cluster_range'] = (2,36,1,1)
kwargs['clustering'] = ('agglomerative', 'ward')
table, re = iterate(**kwargs)
display(html_table(table))
print(re['project_directory'][42:-12])