Iterative ILE Clustering 2019-02-09

This notebook is shared as Iterative-Clustering-ILE-POCE-CDS-2019-02-09.html;
its output data are stored in the Iterative-Clustering-ILE-POCE-CDS-2019-02-09 directory.

Settings

In [1]:
import os, sys, time
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path: sys.path.append(module_path)
from src.grammar_learner.utl import UTC, kwa
from src.grammar_learner.read_files import check_dir
from src.grammar_learner.widgets import html_table
from src.grammar_learner.pqa_table import params
from src.grammar_learner.incremental_clustering import iterate
# Scratch directory for temporary files. check_dir presumably creates it when
# missing -- TODO confirm the meaning of its (True, 'none') arguments.
tmpath = '{}/tmp/'.format(module_path)
check_dir(tmpath, True, 'none')
# `table` and `start` are initialised here; no visible cell reads them later.
table, start = [], time.time()
# Results directory, suffixed with the first 10 characters of the UTC stamp
# (the date part, e.g. 2019-02-09).
out_dir = ''.join([
    module_path,
    '/output/Iterative-Clustering-ILE-POCE-CDS-',
    str(UTC())[:10],
])
print(UTC(), ':: out_dir:\n', out_dir)
2019-02-09 10:46:39 UTC :: out_dir:
 /home/obaskov/94/language-learning/output/Iterative-Clustering-ILE-POCE-CDS-2019-02-09
In [2]:
# Default corpus and dataset. They drive both `input_path` and the matching
# `kwargs` entries, so the two can no longer drift apart.
corpus = 'GCB'
dataset = 'LG-E-clean'
input_path = module_path + '/data/' + corpus + '/' + dataset
ref_corpus = input_path
# Shared settings passed to iterate(**kwargs); later cells override single
# keys in place before each run.
kwargs = {
    # Corpora:
    'corpus'        : corpus        ,
    'dataset'       : dataset       ,
    # 'input_parses': input_path    , # paths are set by 'corpus' and 'dataset'
    'reference_path': ref_corpus    ,
    # 'corpus_path' : ref_corpus    , # corpus path = reference path
    'module_path'   : module_path   , # language-learning dir (default)
    # Word space:
    'stop_words'    :   []          , # trash filter off
    'min_word_count':   1           ,
    'left_wall'     :   ''          ,
    'period'        :   False       ,
    'context'       :   2           , # disjunct-based word vector space
    'word_space'    :   'discrete'  , # "ILE"
    # Category learning:
    'clustering'    :   'group'     ,
    'cluster_range' :   0           ,
    'cluster_criteria'  : 'silhouette',
    'clustering_metric' : ('silhouette', 'cosine'),
    'categories_generalization' : 'off',
    # Grammar rules induction:
    'grammar_rules'         : 2     , # disjunct-based link grammar rules
    'rules_generalization'  : 'off' ,
    'rules_merge'           : 0.8   ,
    'rules_aggregation'     : 0.2   ,
    'top_level'             : 0.01  ,
    # Etc...:
    'out_path'      :   out_dir     ,
    'output_grammar':   out_dir     ,
    'tmpath'        :   tmpath      ,
    'verbose'       :   'min'       ,
    'template_path' :   'poc-turtle',
    'linkage_limit' :   1000        ,
    'iterations'    :   12
}
# Echo the input path only when check_dir reports a truthy result for it.
if check_dir(input_path):
    print(UTC(), ':: input_path:\n', input_path)
2019-02-09 10:46:39 UTC :: input_path:
 /home/obaskov/94/language-learning/data/GCB/LG-E-clean

Tests: "POC-English"

"POC-English", ILE, no generalization

In [3]:
%%capture
kwargs['corpus'] = 'POC-English-Amb'
kwargs['dataset'] = 'MST-fixed-manually'
kwargs['reference_path'] = module_path + \
    '/data/POC-English-Amb/MST-fixed-manually/poc-english_ex-parses-gold.txt'
if 'corpus_path' in kwargs: del kwargs['corpus_path']
kwargs['rules_generalization'] = 'off'
kwargs['cluster_range']        =  0  # just numbering directories
t21, re21 = iterate(**kwargs)
In [4]:
# Render the per-iteration results table returned by iterate().
display(html_table(t21))
# Print the run directory with its first 42 and last 12 characters cut off.
# NOTE(review): 42 equals len('/home/obaskov/94/language-learning/output/'),
# the prefix on the original machine -- the slice is off on other checkouts.
print(re21['project_directory'][42:-12])
IterationN clustersPAF1
137100%,1.0
237100%,1.0
Iterative-Clustering-ILE-POCE-CDS-2019-02-09/POC-English-Amb_MST-fixed-manually_dILEd_no-gen_0c

"POC-English", ILE + generalization

In [5]:
%%capture
kwargs['rules_generalization'] = 'fast'
kwargs['rules_aggregation'] = 0.2       # 0.2: 28 โ‡’ 26, F1=0.99
kwargs['cluster_range']     =   2  # just numbering directories
t22, re22 = iterate(**kwargs)
In [6]:
# Render the per-iteration results table returned by iterate().
display(html_table(t22))
# Print the run directory with its first 42 and last 12 characters cut off.
# NOTE(review): the 42-character prefix matches the original machine's
# output path only -- confirm before running from another location.
print(re22['project_directory'][42:-12])
IterationN clustersPAF1
128100%,0.99
226100%,0.99
326100%,0.99
Iterative-Clustering-ILE-POCE-CDS-2019-02-09/POC-English-Amb_MST-fixed-manually_dILEd_gen-rules_2c
In [7]:
%%capture
kwargs['rules_aggregation'] = 0.1
kwargs['cluster_range']     =   1  # just numbering directories
t23, re23 = iterate(**kwargs)
In [8]:
# Results table first, then the run directory with its first 42 and last 12
# characters cut off (the 42 is tied to the original machine's output path).
display(html_table(t23))
print(re23['project_directory'][42:-12])
IterationN clustersPAF1
125100%,0.99
218100%,0.97
317100%,0.97
417100%,0.97
Iterative-Clustering-ILE-POCE-CDS-2019-02-09/POC-English-Amb_MST-fixed-manually_dILEd_gen-rules_1c

Tests: "Child Directed Speech"

In [9]:
# Switch the shared settings to the "Child Directed Speech" corpus and use
# its LG-E-clean directory as the reference.
kwargs.update({
    'corpus': 'CDS',
    'reference_path': module_path + '/data/CDS/LG-E-clean',
})
# Make sure no explicit corpus path is left in the settings.
kwargs.pop('corpus_path', None)

CDS LG-E-clean

In [10]:
%%capture
kwargs['dataset'] = 'LG-E-clean'
kwargs['rules_generalization'] = 'off'
kwargs['cluster_range']        =  0  # just numbering directories
t31, re31 = iterate(**kwargs)
In [11]:
# Results table first, then the run directory with its first 42 and last 12
# characters cut off (the 42 is tied to the original machine's output path).
display(html_table(t31))
print(re31['project_directory'][42:-12])
IterationN clustersPAF1
1301100%,0.99
2301100%,0.99
Iterative-Clustering-ILE-POCE-CDS-2019-02-09/CDS_LG-E-clean_dILEd_no-gen_0c
In [12]:
%%capture
kwargs['rules_generalization'] = 'fast'
kwargs['rules_aggregation'] = 0.2  # 0.2: 240โ‡’231โ‡’229, F1=0.99
kwargs['cluster_range']     =   2  # just numbering directories
t32, re32 = iterate(**kwargs)
In [13]:
# Results table first, then the run directory with its first 42 and last 12
# characters cut off (the 42 is tied to the original machine's output path).
display(html_table(t32))
print(re32['project_directory'][42:-12])
IterationN clustersPAF1
1240100%,0.99
2231100%,0.99
3229100%,0.99
4229100%,0.99
Iterative-Clustering-ILE-POCE-CDS-2019-02-09/CDS_LG-E-clean_dILEd_gen-rules_2c
In [14]:
%%capture
kwargs['rules_aggregation'] = 0.1
kwargs['cluster_range']     =   1  # just numbering directories
t33, re33 = iterate(**kwargs)
In [15]:
# Results table first, then the run directory with its first 42 and last 12
# characters cut off (the 42 is tied to the original machine's output path).
display(html_table(t33))
print(re33['project_directory'][42:-12])
IterationN clustersPAF1
1137100%,0.91
251100%,0.83
317100%,0.77
410100%,0.75
54100%,0.75
63100%,0.75
Iterative-Clustering-ILE-POCE-CDS-2019-02-09/CDS_LG-E-clean_dILEd_gen-rules_1c

CDS LG-E-551

In [16]:
%%capture
kwargs['dataset'] = 'LG-E-551'
kwargs['rules_generalization'] = 'off'
kwargs['cluster_range']        =  0  # just numbering directories
t35, re35 = iterate(**kwargs)    # 13+min ยป 2997 = 2997 clusters, F1=0.97
In [17]:
# Results table first, then the run directory with its first 42 and last 12
# characters cut off (the 42 is tied to the original machine's output path).
display(html_table(t35))
print(re35['project_directory'][42:-12])
IterationN clustersPAF1
12997100%,0.97
22997100%,0.97
Iterative-Clustering-ILE-POCE-CDS-2019-02-09/CDS_LG-E-551_dILEd_no-gen_0c
In [18]:
%%capture
kwargs['rules_generalization'] = 'fast'
kwargs['rules_aggregation'] = 0.2
kwargs['cluster_range']     =   2  # just numbering directories
t36, re36 = iterate(**kwargs)
In [19]:
# Results table first, then the run directory with its first 42 and last 12
# characters cut off (the 42 is tied to the original machine's output path).
display(html_table(t36))
print(re36['project_directory'][42:-12])
IterationN clustersPAF1
12148100%,0.98
22091100%,0.97
32091100%,0.97
Iterative-Clustering-ILE-POCE-CDS-2019-02-09/CDS_LG-E-551_dILEd_gen-rules_2c
In [20]:
%%capture
kwargs['rules_generalization'] = 'fast'
kwargs['rules_aggregation'] = 0.1
kwargs['cluster_range']     =   1  # just numbering directories
t37, re37 = iterate(**kwargs)
In [21]:
# Results table first, then the run directory with its first 42 and last 12
# characters cut off (the 42 is tied to the original machine's output path).
# NOTE(review): the recorded output of this 10-iteration run kept a trailing
# '/', so the fixed -12 cut evidently does not fit every run -- confirm.
display(html_table(t37))
print(re37['project_directory'][42:-12])
IterationN clustersPAF1
11350100%,0.87
2772100%,0.84
3609100%,0.78
4461100%,0.77
5362100%,0.76
6280100%,0.75
7215100%,0.75
8193100%,0.76
9172100%,0.75
10172100%,0.75
Iterative-Clustering-ILE-POCE-CDS-2019-02-09/CDS_LG-E-551_dILEd_gen-rules_1c/

"Gutenberg Children Books"

In [ ]:
%%capture
kwargs['corpus'] = 'GCB'
kwargs['dataset'] = 'LG-E-clean'  # 'LG-E-551-pq24878'
if 'reference_path' in kwargs: del kwargs['reference_path']
if 'corpus_path' in kwargs: del kwargs['corpus_path']
kwargs['word_space'] = 'discrete'
kwargs['clustering'] = 'group'
kwargs['rules_generalization'] = 'off'
kwargs['cluster_range']        =  0
t41, r41 = iterate(**kwargs)
In [ ]:
# Results table first, then the run directory with its first 42 and last 12
# characters cut off (the 42 is tied to the original machine's output path).
# NOTE(review): `re41` must be bound by the preceding iterate() cell.
display(html_table(t41))
print(re41['project_directory'][42:-12])

It looks like "ILE clustering" needs a performance check and improvements for larger corpora...