Iterative ILE Clustering 2019-02-27 check

This notebook is shared as Iterative-clustering-ILE-POCE-CDS-2019-02-27.html;
its output data is shared as Iterative-clustering-ILE-POCE-CDS-2019-02-27.

Note: the low scores in the "Gutenberg Children Books" tests at the end of this notebook look like a tagging or parse-processing issue, or an issue in the input parses that leads to wrong reading and tagging.

Settings

In [1]:
import os, sys, time
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path: sys.path.append(module_path)
from src.grammar_learner.utl import UTC, kwa
from src.grammar_learner.read_files import check_dir
from src.grammar_learner.widgets import html_table
from src.grammar_learner.pqa_table import params
from src.grammar_learner.incremental_clustering import iterate
# Scratch directory under the repository root.
# NOTE(review): the (True, 'none') arguments presumably mean "create if missing,
# no logging" -- confirm against src.grammar_learner.read_files.check_dir.
tmpath = '/'.join([module_path, 'tmp', ''])
check_dir(tmpath, True, 'none')

# `table` and `start` are not read by any cell visible in this notebook.
table = []
start = time.time()

# One output directory per run date: the first 10 characters of str(UTC())
# are the YYYY-MM-DD part of the timestamp.
out_dir = ''.join([
    module_path,
    '/output/Iterative-Clustering-ILE-',
    str(UTC())[:10],
])
print(UTC(), ':: out_dir:\n', out_dir)
2019-02-27 17:18:19 UTC :: out_dir:
 /home/obaskov/94/language-learning/output/Iterative-Clustering-ILE-2019-02-27
In [2]:
# Default corpus / dataset. The test cells below override 'corpus', 'dataset'
# and other keys in `kwargs` before each call to iterate().
corpus = 'GCB'
dataset = 'LG-E-clean'
input_path = '/'.join([module_path, 'data', corpus, dataset])
ref_corpus = input_path

# Baseline settings shared by every run in this notebook.
kwargs = {
    # --- Corpora ---
    'corpus': corpus,
    'dataset': dataset,
    'max_sentence_length': 10,
    # 'input_parses': input_path,  # paths are set by 'corpus' and 'dataset'
    'reference_path': ref_corpus,
    # 'corpus_path': ref_corpus,   # corpus path = reference path
    'module_path': module_path,    # language-learning dir (default)
    # --- Word space ---
    'stop_words': [],              # trash filter off
    'min_word_count': 1,
    'left_wall': '',
    'period': False,
    'context': 2,                  # disjunct-based word vector space
    'word_space': 'discrete',      # "ILE"
    # --- Category learning ---
    'clustering': 'group',         # "ILE"
    'cluster_range': 0,
    'cluster_criteria': 'silhouette',
    'clustering_metric': ('silhouette', 'cosine'),
    'categories_generalization': 'off',
    # --- Grammar rules induction ---
    'grammar_rules': 2,            # disjunct-based link grammar rules
    'rules_generalization': 'off',
    'rules_merge': 0.8,
    'rules_aggregation': 0.2,
    'top_level': 0.01,
    # --- Etc. ---
    'out_path': out_dir,
    'output_grammar': out_dir,
    'tmpath': tmpath,
    'verbose': 'min',
    'template_path': 'poc-turtle',
    'linkage_limit': 1000,
    'iterations': 12,
}

# Report the input path only when check_dir() returns a truthy value for it.
if check_dir(input_path):
    print(UTC(), ':: input_path:\n', input_path)
2019-02-27 17:18:19 UTC :: input_path:
 /home/obaskov/94/language-learning/data/GCB/LG-E-clean

Tests: "POC-English" corpus

"POC-English", ILE, no generalization

In [3]:
%%capture
kwargs['corpus'] = 'POC-English-Amb'
kwargs['dataset'] = 'MST-fixed-manually'
kwargs['reference_path'] = module_path + \
    '/data/POC-English-Amb/MST-fixed-manually/poc-english_ex-parses-gold.txt'
if 'corpus_path' in kwargs: del kwargs['corpus_path']
kwargs['rules_generalization'] = 'off'
kwargs['cluster_range']        =  0  # just numbering directories
if 'min_word_count' in kwargs: del kwargs['min_word_count']
t21, re21 = iterate(**kwargs)
In [4]:
# Render the per-iteration results table returned by iterate().
display(html_table(t21))
# Print the project directory relative to <module_path>/output/.
# The prefix length is derived from module_path instead of the hard-coded 42,
# which only matched the original author's absolute checkout path.
# NOTE(review): -12 presumably trims a fixed-width trailing path component -- confirm.
print(re21['project_directory'][len(module_path + '/output/'):-12])
IterationN clustersPAF1
13498%,0.98
23498%,0.98
Iterative-Clustering-ILE-2019-02-27/POC-English-Amb_MST-fixed-manually_dILEd_no-gen_0c

"POC-English", ILE + generalization

In [5]:
%%capture
kwargs['rules_generalization'] = 'fast'
kwargs['rules_aggregation'] = 0.2
kwargs['cluster_range']     =   2  # just numbering directories
t22, re22 = iterate(**kwargs)
In [6]:
# Render the per-iteration results table returned by iterate().
display(html_table(t22))
# Print the project directory relative to <module_path>/output/.
# The prefix length is derived from module_path instead of the hard-coded 42,
# which only matched the original author's absolute checkout path.
# NOTE(review): -12 presumably trims a fixed-width trailing path component -- confirm.
print(re22['project_directory'][len(module_path + '/output/'):-12])
IterationN clustersPAF1
126100%,0.99
226100%,0.99
Iterative-Clustering-ILE-2019-02-27/POC-English-Amb_MST-fixed-manually_dILEd_gen-rules_2c
In [7]:
%%capture
kwargs['rules_aggregation'] = 0.1
kwargs['cluster_range']     =   1  # just numbering directories
t23, re23 = iterate(**kwargs)
In [8]:
# Render the per-iteration results table returned by iterate().
display(html_table(t23))
# Print the project directory relative to <module_path>/output/.
# The prefix length is derived from module_path instead of the hard-coded 42,
# which only matched the original author's absolute checkout path.
# NOTE(review): -12 presumably trims a fixed-width trailing path component -- confirm.
print(re23['project_directory'][len(module_path + '/output/'):-12])
IterationN clustersPAF1
123100%,0.99
218100%,0.97
317100%,0.97
417100%,0.97
Iterative-Clustering-ILE-2019-02-27/POC-English-Amb_MST-fixed-manually_dILEd_gen-rules_1c

Tests: "Child Directed Speech" corpus ("CDS")

In [9]:
# Switch to the "Child Directed Speech" corpus for the runs that follow.
# Remove 'corpus_path' if present; nothing happens when it is absent.
kwargs.pop('corpus_path', None)
kwargs.update({
    'corpus': 'CDS',
    'reference_path': module_path + '/data/CDS/LG-E-clean',
})

CDS "LG-E-clean"

In [10]:
%%capture
kwargs['dataset'] = 'LG-E-clean'
kwargs['rules_generalization'] = 'off'
kwargs['cluster_range']        =  0  # just numbering directories
t31, re31 = iterate(**kwargs)
In [11]:
# Render the per-iteration results table returned by iterate().
display(html_table(t31))
# Print the project directory relative to <module_path>/output/.
# The prefix length is derived from module_path instead of the hard-coded 42,
# which only matched the original author's absolute checkout path.
# NOTE(review): -12 presumably trims a fixed-width trailing path component -- confirm.
print(re31['project_directory'][len(module_path + '/output/'):-12])
IterationN clustersPAF1
130199%,0.99
230199%,0.99
Iterative-Clustering-ILE-2019-02-27/CDS_LG-E-clean_dILEd_no-gen_0c
In [12]:
%%capture
kwargs['rules_generalization'] = 'fast'
kwargs['rules_aggregation'] = 0.2
kwargs['cluster_range']     =   2  # just numbering directories
t32, re32 = iterate(**kwargs)
In [13]:
# Render the per-iteration results table returned by iterate().
display(html_table(t32))
# Print the project directory relative to <module_path>/output/.
# The prefix length is derived from module_path instead of the hard-coded 42,
# which only matched the original author's absolute checkout path.
# NOTE(review): -12 presumably trims a fixed-width trailing path component -- confirm.
print(re32['project_directory'][len(module_path + '/output/'):-12])
IterationN clustersPAF1
1239100%,0.99
2230100%,0.99
3227100%,0.99
4227100%,0.99
Iterative-Clustering-ILE-2019-02-27/CDS_LG-E-clean_dILEd_gen-rules_2c
In [14]:
%%capture
kwargs['rules_aggregation'] = 0.1
kwargs['cluster_range']     =   1  # just numbering directories
t33, re33 = iterate(**kwargs)
In [15]:
# Render the per-iteration results table returned by iterate().
display(html_table(t33))
# Print the project directory relative to <module_path>/output/.
# The prefix length is derived from module_path instead of the hard-coded 42,
# which only matched the original author's absolute checkout path.
# NOTE(review): -12 presumably trims a fixed-width trailing path component -- confirm.
print(re33['project_directory'][len(module_path + '/output/'):-12])
IterationN clustersPAF1
1137100%,0.9
245100%,0.8
314100%,0.76
48100%,0.76
54100%,0.76
63100%,0.76
Iterative-Clustering-ILE-2019-02-27/CDS_LG-E-clean_dILEd_gen-rules_1c

CDS LG-E-551

In [16]:
%%capture
kwargs['dataset'] = 'LG-E-551'
kwargs['rules_generalization'] = 'off'
kwargs['cluster_range']        =  0  # just numbering directories
t35, re35 = iterate(**kwargs)
In [17]:
# Render the per-iteration results table returned by iterate().
display(html_table(t35))
# Print the project directory relative to <module_path>/output/.
# The prefix length is derived from module_path instead of the hard-coded 42,
# which only matched the original author's absolute checkout path.
# NOTE(review): -12 presumably trims a fixed-width trailing path component -- confirm.
print(re35['project_directory'][len(module_path + '/output/'):-12])
IterationN clustersPAF1
12578100%,0.98
22578100%,0.98
Iterative-Clustering-ILE-2019-02-27/CDS_LG-E-551_dILEd_no-gen_0c
In [18]:
%%capture
kwargs['rules_generalization'] = 'fast'
kwargs['rules_aggregation'] = 0.2  # 0.2: 2148⇒2091, 0.98⇒0.91
kwargs['cluster_range']     =   2  # just numbering directories
t36, re36 = iterate(**kwargs)
In [19]:
# Render the per-iteration results table returned by iterate().
display(html_table(t36))
# Print the project directory relative to <module_path>/output/.
# The prefix length is derived from module_path instead of the hard-coded 42,
# which only matched the original author's absolute checkout path.
# NOTE(review): -12 presumably trims a fixed-width trailing path component -- confirm.
print(re36['project_directory'][len(module_path + '/output/'):-12])
IterationN clustersPAF1
11823100%,0.98
21769100%,0.98
31761100%,0.97
41761100%,0.97
Iterative-Clustering-ILE-2019-02-27/CDS_LG-E-551_dILEd_gen-rules_2c
In [20]:
%%capture
kwargs['rules_generalization'] = 'fast'
kwargs['rules_aggregation'] = 0.1
kwargs['cluster_range']     =   1  # just numbering directories
t37, re37 = iterate(**kwargs)
In [21]:
# Render the per-iteration results table returned by iterate().
display(html_table(t37))
# Print the project directory relative to <module_path>/output/.
# The prefix length is derived from module_path instead of the hard-coded 42,
# which only matched the original author's absolute checkout path.
# NOTE(review): -12 presumably trims a fixed-width trailing path component -- confirm.
print(re37['project_directory'][len(module_path + '/output/'):-12])
IterationN clustersPAF1
11125100%,0.87
2631100%,0.83
3446100%,0.77
4343100%,0.76
5213100%,0.76
6181100%,0.76
7165100%,0.76
8165100%,0.76
Iterative-Clustering-ILE-2019-02-27/CDS_LG-E-551_dILEd_gen-rules_1c

"Gutenberg Children Books" corpus

In [27]:
%%capture
kwargs['corpus'] = 'GCB'
kwargs['dataset'] = 'LG-E-clean'
if 'reference_path' in kwargs: del kwargs['reference_path']
if 'corpus_path' in kwargs: del kwargs['corpus_path']
kwargs['word_space'] = 'discrete'
kwargs['clustering'] = 'group'
kwargs['rules_generalization'] = 'off'
kwargs['cluster_range'] =  0  # just numbering directories
kwargs['min_word_count'] = 21
t40, r40 = iterate(**kwargs)
In [28]:
# Render the per-iteration results table returned by iterate().
display(html_table(t40))
# Print the project directory relative to <module_path>/output/.
# The prefix length is derived from module_path instead of the hard-coded 42,
# which only matched the original author's absolute checkout path.
# NOTE(review): the recorded output of this 12-iteration run keeps a trailing
# '/', so the suffix trimmed by -12 is apparently not fixed-width -- confirm.
print(r40['project_directory'][len(module_path + '/output/'):-12])
IterationN clustersPAF1
1112724%,0.26
227710%,0.09
329910%,0.09
427710%,0.09
529910%,0.09
627710%,0.09
729910%,0.09
827710%,0.09
929910%,0.09
1027710%,0.09
1129910%,0.09
1227710%,0.09
Iterative-Clustering-ILE-2019-02-27/GCB_LG-E-clean_dILEd_no-gen_0c_mwc=21/

This looks like a tagging or corpus-processing issue, or possibly an issue in the input parses that leads to wrong reading and tagging.

In [33]:
%%capture
kwargs['corpus'] = 'GCB'
kwargs['dataset'] = 'LG-E-clean'  # 'LG-E-551-pq24878'
if 'reference_path' in kwargs: del kwargs['reference_path']
if 'corpus_path' in kwargs: del kwargs['corpus_path']
kwargs['word_space'] = 'discrete'
kwargs['clustering'] = 'group'
kwargs['rules_generalization'] = 'fast'
kwargs['cluster_range']     =   3  # just numbering directories
kwargs['min_word_count'] = 11
kwargs['rules_aggregation'] = 0.2
t43, r43 = iterate(**kwargs)
In [34]:
# Render the per-iteration results table returned by iterate().
display(html_table(t43))
# Print the project directory relative to <module_path>/output/.
# The prefix length is derived from module_path instead of the hard-coded 42,
# which only matched the original author's absolute checkout path.
# NOTE(review): the recorded output of this 12-iteration run keeps a trailing
# '/', so the suffix trimmed by -12 is apparently not fixed-width -- confirm.
print(r43['project_directory'][len(module_path + '/output/'):-12])
IterationN clustersPAF1
1179526%,0.28
248810%,0.1
354910%,0.1
448910%,0.09
554910%,0.1
648910%,0.09
754910%,0.1
848910%,0.09
954910%,0.1
1048910%,0.09
1154910%,0.1
1248910%,0.09
Iterative-Clustering-ILE-2019-02-27/GCB_LG-E-clean_dILEd_gen-rules_3c_mwc=11/
In [35]:
%%capture
kwargs['corpus'] = 'GCB'
kwargs['dataset'] = 'LG-E-clean'  # 'LG-E-551-pq24878'
if 'reference_path' in kwargs: del kwargs['reference_path']
if 'corpus_path' in kwargs: del kwargs['corpus_path']
kwargs['word_space'] = 'discrete'
kwargs['clustering'] = 'group'
kwargs['rules_generalization'] = 'fast'
kwargs['cluster_range']     =   4  # just numbering directories
kwargs['min_word_count'] = 11
kwargs['rules_aggregation'] = 0.3
t44, r44 = iterate(**kwargs)
In [36]:
# Render the per-iteration results table returned by iterate().
display(html_table(t44))
# Print the project directory relative to <module_path>/output/.
# The prefix length is derived from module_path instead of the hard-coded 42,
# which only matched the original author's absolute checkout path.
# NOTE(review): the recorded output of this 12-iteration run keeps a trailing
# '/', so the suffix trimmed by -12 is apparently not fixed-width -- confirm.
print(r44['project_directory'][len(module_path + '/output/'):-12])
IterationN clustersPAF1
1184926%,0.28
249010%,0.09
355010%,0.1
449010%,0.09
555010%,0.1
649010%,0.09
755010%,0.1
849010%,0.09
955010%,0.1
1049010%,0.09
1155010%,0.1
1249010%,0.09
Iterative-Clustering-ILE-2019-02-27/GCB_LG-E-clean_dILEd_gen-rules_4c_mwc=11/