2019-02-09
This notebook is shared as Iterative-Clustering-ILE-POCE-CDS-2019-02-09.html,
output data -- Iterative-Clustering-ILE-POCE-CDS-2019-02-09
import os, sys, time
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path: sys.path.append(module_path)
from src.grammar_learner.utl import UTC, kwa
from src.grammar_learner.read_files import check_dir
from src.grammar_learner.widgets import html_table
from src.grammar_learner.pqa_table import params
from src.grammar_learner.incremental_clustering import iterate
# Scratch directory for temporary files. The (True, 'none') arguments to
# check_dir presumably mean "create if missing" and "no logging" --
# TODO confirm against src.grammar_learner.read_files.check_dir.
tmpath = ''.join((module_path, '/tmp/'))
check_dir(tmpath, True, 'none')

# Empty results list and wall-clock start time of this notebook run.
table = []
start = time.time()

# Output directory: a fixed prefix plus the first 10 characters of str(UTC())
# (presumably the YYYY-MM-DD date of the run).
out_dir = ''.join((
    module_path,
    '/output/Iterative-Clustering-ILE-POCE-CDS-',
    str(UTC())[:10],
))
print(UTC(), ':: out_dir:\n', out_dir)

# Default corpus / dataset; the input parses double as the reference corpus.
corpus = 'GCB'
dataset = 'LG-E-clean'
input_path = ''.join((module_path, '/data/', corpus, '/', dataset))
ref_corpus = input_path
# Parameters handed to iterate(**kwargs). The dict is filled section by
# section; the cells below override individual entries before each run.
kwargs = {}

# Corpora. 'input_parses' is not set because paths are derived from 'corpus'
# and 'dataset'; 'corpus_path' is not set because corpus path = reference path.
kwargs.update({
    'corpus': 'GCB',
    'dataset': 'LG-E-clean',
    'reference_path': ref_corpus,
    'module_path': module_path,  # language-learning dir (default)
})

# Word space.
kwargs.update({
    'stop_words': [],  # trash filter off
    'min_word_count': 1,
    'left_wall': '',
    'period': False,
    'context': 2,  # disjunct-based word vector space
    'word_space': 'discrete',  # "ILE"
})

# Category learning.
kwargs.update({
    'clustering': 'group',
    'cluster_range': 0,
    'cluster_criteria': 'silhouette',
    'clustering_metric': ('silhouette', 'cosine'),
    'categories_generalization': 'off',
})

# Grammar rules induction.
kwargs.update({
    'grammar_rules': 2,  # disjunct-based link grammar rules
    'rules_generalization': 'off',
    'rules_merge': 0.8,
    'rules_aggregation': 0.2,
    'top_level': 0.01,
})

# Output locations, logging and run control.
kwargs.update({
    'out_path': out_dir,
    'output_grammar': out_dir,
    'tmpath': tmpath,
    'verbose': 'min',
    'template_path': 'poc-turtle',
    'linkage_limit': 1000,
    'iterations': 12,
})

# Report the input directory only when check_dir returns a truthy result.
if check_dir(input_path):
    print(UTC(), ':: input_path:\n', input_path)
%%capture
kwargs['corpus'] = 'POC-English-Amb'
kwargs['dataset'] = 'MST-fixed-manually'
kwargs['reference_path'] = module_path + \
'/data/POC-English-Amb/MST-fixed-manually/poc-english_ex-parses-gold.txt'
if 'corpus_path' in kwargs: del kwargs['corpus_path']
kwargs['rules_generalization'] = 'off'
kwargs['cluster_range'] = 0 # just numbering directories
t21, re21 = iterate(**kwargs)
display(html_table(t21))
print(re21['project_directory'][42:-12])
%%capture
kwargs['rules_generalization'] = 'fast'
kwargs['rules_aggregation'] = 0.2 # 0.2: 28 โ 26, F1=0.99
kwargs['cluster_range'] = 2 # just numbering directories
t22, re22 = iterate(**kwargs)
display(html_table(t22))
print(re22['project_directory'][42:-12])
%%capture
kwargs['rules_aggregation'] = 0.1
kwargs['cluster_range'] = 1 # just numbering directories
t23, re23 = iterate(**kwargs)
display(html_table(t23)); print(re23['project_directory'][42:-12])
kwargs['corpus'] = 'CDS'
kwargs['reference_path'] = module_path + '/data/CDS/LG-E-clean'
if 'corpus_path' in kwargs: del kwargs['corpus_path']
%%capture
kwargs['dataset'] = 'LG-E-clean'
kwargs['rules_generalization'] = 'off'
kwargs['cluster_range'] = 0 # just numbering directories
t31, re31 = iterate(**kwargs)
display(html_table(t31)); print(re31['project_directory'][42:-12])
%%capture
kwargs['rules_generalization'] = 'fast'
kwargs['rules_aggregation'] = 0.2 # 0.2: 240โ231โ229, F1=0.99
kwargs['cluster_range'] = 2 # just numbering directories
t32, re32 = iterate(**kwargs)
display(html_table(t32)); print(re32['project_directory'][42:-12])
%%capture
kwargs['rules_aggregation'] = 0.1
kwargs['cluster_range'] = 1 # just numbering directories
t33, re33 = iterate(**kwargs)
display(html_table(t33)); print(re33['project_directory'][42:-12])
%%capture
kwargs['dataset'] = 'LG-E-551'
kwargs['rules_generalization'] = 'off'
kwargs['cluster_range'] = 0 # just numbering directories
t35, re35 = iterate(**kwargs) # 13+min ยป 2997 = 2997 clusters, F1=0.97
display(html_table(t35)); print(re35['project_directory'][42:-12])
%%capture
kwargs['rules_generalization'] = 'fast'
kwargs['rules_aggregation'] = 0.2
kwargs['cluster_range'] = 2 # just numbering directories
t36, re36 = iterate(**kwargs)
display(html_table(t36)); print(re36['project_directory'][42:-12])
%%capture
kwargs['rules_generalization'] = 'fast'
kwargs['rules_aggregation'] = 0.1
kwargs['cluster_range'] = 1 # just numbering directories
t37, re37 = iterate(**kwargs)
display(html_table(t37)); print(re37['project_directory'][42:-12])
%%capture
kwargs['corpus'] = 'GCB'
kwargs['dataset'] = 'LG-E-clean' # 'LG-E-551-pq24878'
if 'reference_path' in kwargs: del kwargs['reference_path']
if 'corpus_path' in kwargs: del kwargs['corpus_path']
kwargs['word_space'] = 'discrete'
kwargs['clustering'] = 'group'
kwargs['rules_generalization'] = 'off'
kwargs['cluster_range'] = 0
t41, r41 = iterate(**kwargs)
display(html_table(t41)); print(re41['project_directory'][42:-12])
It looks like "ILE clustering" needs a performance check and improvements for larger corpora...