Common settings

In [1]:
import os, sys, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display
# Put the parent of the current working directory on the import path so the
# project's `src` package can be imported by the lines below.
module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)
from src.utl.utl import UTC
from src.utl.read_files import check_dir
print(UTC(), module_path)

# Settings shared by the cells below.
prefix = ''             # unused option
date = '2018-04-13'     # date stamp embedded in the output directory names
verbose = 'mid'         # verbosity level handed to gimme_pictures
tmpath = module_path + '/tmp/'   # temporary directory handed to learn_grammar
check_dir(tmpath, True, 'none')
2018-04-13 10:16:27 UTC /home/oleg/language-learning

Call Grammar Learner

In [2]:
def gimme_pictures(input_dir, prj_dir, context, grammar_rules, verbose='mid'):
    """Run the Grammar Learner on one corpus with this notebook's fixed settings.

    Args:
        input_dir: input directory, forwarded to learn_grammar unchanged.
        prj_dir: project directory; checked with check_dir(..., create=True)
            and used as the path for both the learned categories and the
            learned Link Grammar dictionary file.
        context: 1 - connectors / 0 - 'words' /
            2, 3... - disjuncts with a limited number of links.
        grammar_rules: 1 - 'connectors' / 2 - 'disjuncts' / 0 - 'words' (TODO?).
        verbose: display intermediate results: 'none', 'min', 'mid', 'max'.

    Returns:
        The last line of the string returned by learn_grammar with its first
        two characters dropped (in the runs recorded in this notebook:
        'Link Grammar file saved to: ...').

    Raises:
        OSError: if check_dir reports that prj_dir is not usable. Previously
            this case surfaced as an UnboundLocalError on cat_path.

    Note:
        Reads the notebook-level variable `tmpath` (defined in the settings
        cell), so that cell has to be executed first.
    """
    from src.utl.read_files import check_dir
    from src.grammar_learner.poc03 import learn_grammar

    # Fail early with a readable message instead of leaving the output paths
    # undefined when the project directory cannot be used.
    if not check_dir(prj_dir, create=True, verbose='none'):
        raise OSError('Project directory is not available: ' + str(prj_dir))
    cat_path = prj_dir      # Path to store learned categories
    dict_path = prj_dir     # Path to store learned dictionary Link Grammar file

    # Fixed learner settings; only context and grammar_rules vary per run.
    parse_mode = 'given'            # 'given' (default) / 'explosive' (next)
    left_wall = 'LEFT-WALL'         # '','none' - don't use / 'LW' - replace ###LEFT-WALL### with 'LW'
    period = True                   # use period in links learning: True/False
    window = 'mst'                  # 'mst' / reserved options for «explosive» parsing
    weighting = 'ppmi'              # 'ppmi' / future weighting options
    group = True                    # group items after link parsing, sum counts
    # TODO: +distance = False?
    word_space = 'vectors'          # 'vectors' / 'discrete' - no dimensionality reduction
    dim_max = 100                   # max vector space dimensionality
    sv_min = 0.1                    # minimal singular value (fraction of the max value)
    dim_reduction = 'svm'           # 'svm' / 'none' (discrete word_space, group)
    clustering = 'kmeans'           # 'kmeans' / 'group'~'identical_entries' / future options
    cluster_range = (2,48,1)        # min, max, step
    cluster_criteria = 'silhouette'
    cluster_level = 0.9             # level = 0, 1, 0.-0.99..: 0 - max number of clusters
    generalization = 'off'          # 'off' / future options: 'cosine', ...
    merge = 0.8                     # merge clusters with similarity > this 'merge' criteria
    aggregate = 0.2                 # agglomerate clusters with similarity > this criteria

    # learn_grammar is called positionally: the argument order is significant.
    lg_rules_str = learn_grammar(
        input_dir, cat_path, dict_path, tmpath, verbose,
        parse_mode, left_wall, period, context, window, weighting, group,
        word_space, dim_max, sv_min, dim_reduction,
        clustering, cluster_range, cluster_criteria, cluster_level,
        generalization, merge, aggregate, grammar_rules)

    # Keep only the final line of the response, minus its 2-character prefix.
    return lg_rules_str.split('\n')[-1][2:]

# Print a timestamp for this cell run (UTC is imported from src.utl.utl in the settings cell).
print(UTC())
2018-04-13 10:16:27 UTC

POC-Turtle, MST fixed manually

In [3]:
# POC-Turtle, manually fixed MST parses: results directory for this batch
# and the directory the learner reads its input from.
batch_dir = ''.join([module_path, '/output/Pictures-', date, '/POC-Turtle/MST_fixed_manually/'])
input_dir = ''.join([module_path, '/data/POC_Turtle/MST_fixed_manually/'])

MST fixed manually, Connectors-DRK-Connectors

In [4]:
# context 1 = connectors, grammar_rules 1 = 'connectors' (see gimme_pictures).
context, grammar_rules = 1, 1
prj_dir = batch_dir + 'connectors-DRK-connectors/'
# Last expression of the cell: its value is shown as the cell output.
gimme_pictures(input_dir, prj_dir, context, grammar_rules, verbose)
15 unique words and 28 unique links form 64 unique word-link pairs from 64 parsed items
Vector space dimensionality = 7
Singular value (7) = 3.4
Max singular value (1) = 5.6
Singular value (7) = 0.19275911971807783
Silhouette index in a range of cluster numbers
Optimal number of clusters: 4
Cluster words in vector space, axes 1 and 2
ParentCategoryQualityWordsRelevance
C00C010.0['.', 'LEFT-WALL', 'has', 'isa'][0, 0, 0, 0]
C00C020.0['bird', 'extremity', 'fish'][0, 0, 0]
C00C030.0['feather', 'fin', 'scale', 'wing'][0, 0, 0, 0]
C00C040.0['eagle', 'herring', 'parrot', 'tuna'][0, 0, 0, 0]
Category list - 4 lines, saved to /home/oleg/language-learning/output/Pictures-2018-04-13/POC-Turtle/MST_fixed_manually/connectors-DRK-connectors/categories.txt
list
ClusterGermsLRDisjuncts
C01['.', 'LEFT-WALL', 'has', 'isa'][][]['C02C01- or C03C01-', 'C01C03+ or C01C04+', '{C03C01- or C04C01-} & {C01C03+}', '{C03C01- or C04C01-} & {C01C02+}']
C02['bird', 'extremity', 'fish'][][]['{C01C02-} & {C02C01+}', '{C01C02-} & {C02C01+}', '{C01C02-} & {C02C01+}']
C03['feather', 'fin', 'scale', 'wing'][][]['{C01C03-} & {C03C01+}', '{C01C03-} & {C03C01+}', '{C01C03-} & {C03C01+}', '{C01C03-} & {C03C01+}']
C04['eagle', 'herring', 'parrot', 'tuna'][][]['{C01C04-} & {C04C01+}', '{C01C04-} & {C04C01+}', '{C01C04-} & {C04C01+}', '{C01C04-} & {C04C01+}']
Out[4]:
'Link Grammar file saved to: /home/oleg/language-learning/output/Pictures-2018-04-13/POC-Turtle/MST_fixed_manually/connectors-DRK-connectors/poc-english_4C_2018-04-13_0007.4.0.dict'

MST fixed manually, Disjuncts-DRK-Disjuncts

In [5]:
# context 2 = disjuncts with a limited number of links,
# grammar_rules 2 = 'disjuncts' (see gimme_pictures).
context, grammar_rules = 2, 2
prj_dir = batch_dir + 'disjuncts-DRK-disjuncts/'
# Last expression of the cell: its value is shown as the cell output.
gimme_pictures(input_dir, prj_dir, context, grammar_rules, verbose)
15 unique words and 29 unique links form 44 unique word-link pairs from 60 parsed items
Vector space dimensionality = 8
Singular value (8) = 3.7
Max singular value (1) = 5.6
Singular value (8) = 3.7385588793217883e-10
Silhouette index in a range of cluster numbers
Optimal number of clusters: 4
Cluster words in vector space, axes 1 and 3
ParentCategoryQualityWordsRelevance
C00C010.0['.', 'LEFT-WALL', 'has', 'isa'][0, 0, 0, 0]
C00C020.0['feather', 'fin', 'scale', 'wing'][0, 0, 0, 0]
C00C030.0['bird', 'extremity', 'fish'][0, 0, 0]
C00C040.0['eagle', 'herring', 'parrot', 'tuna'][0, 0, 0, 0]
Category list - 4 lines, saved to /home/oleg/language-learning/output/Pictures-2018-04-13/POC-Turtle/MST_fixed_manually/disjuncts-DRK-disjuncts/categories.txt
list
ClusterGermsLRDisjuncts
C01['.', 'LEFT-WALL', 'has', 'isa'][][]['C03C01-', 'C02C01-', 'C01C04+', 'C01C02+', 'C04C01- & C01C02+', 'C02C01- & C01C02+', 'C02C01- & C01C03+', 'C04C01- & C01C03+']
C02['feather', 'fin', 'scale', 'wing'][][]['C01C02- & C02C01+', 'C01C02- & C02C01+', 'C01C02- & C02C01+', 'C01C02- & C02C01+']
C03['bird', 'extremity', 'fish'][][]['C01C03- & C03C01+', 'C01C03- & C03C01+', 'C01C03- & C03C01+']
C04['eagle', 'herring', 'parrot', 'tuna'][][]['C01C04- & C04C01+', 'C01C04- & C04C01+', 'C01C04- & C04C01+', 'C01C04- & C04C01+']
Out[5]:
'Link Grammar file saved to: /home/oleg/language-learning/output/Pictures-2018-04-13/POC-Turtle/MST_fixed_manually/disjuncts-DRK-disjuncts/poc-english_4C_2018-04-13_0007.4.0.dict'

POC-English-NoAmb, MST fixed manually

In [6]:
# POC-English-NoAmb, manually fixed MST parses: results directory for this
# batch and the directory the learner reads its input from.
batch_dir = ''.join([module_path, '/output/Pictures-', date, '/POC_English_NoAmb/MST_fixed_manually/'])
input_dir = ''.join([module_path, '/data/POC_English_NoAmb/MST_fixed_manually/'])

MST fixed manually, Connectors-DRK-Connectors

In [7]:
# context 1 = connectors, grammar_rules 1 = 'connectors' (see gimme_pictures).
context, grammar_rules = 1, 1
prj_dir = batch_dir + 'connectors-DRK-connectors/'
# Last expression of the cell: its value is shown as the cell output.
gimme_pictures(input_dir, prj_dir, context, grammar_rules, verbose)
20 unique words and 30 unique links form 104 unique word-link pairs from 104 parsed items
Vector space dimensionality = 14
Singular value (14) = 1.1
Max singular value (1) = 7.3
Singular value (14) = 0.12488603977558205
Silhouette index in a range of cluster numbers
Optimal number of clusters: 9
Cluster words in vector space, axes 1 and 2
ParentCategoryQualityWordsRelevance
C00C010.0['is', 'was'][0, 0]
C00C020.0['liked', 'likes'][0, 0]
C00C030.0['LEFT-WALL', 'a'][0, 0]
C00C040.0['cake', 'sausage'][0, 0]
C00C050.0['dad', 'mom'][0, 0]
C00C060.0['daughter', 'son'][0, 0]
C00C070.0['before', 'not'][0, 0]
C00C080.0['child', 'food', 'human', 'now', 'parent'][0, 0, 0, 0, 0]
C00C090.0['.'][0]
Category list - 9 lines, saved to /home/oleg/language-learning/output/Pictures-2018-04-13/POC_English_NoAmb/MST_fixed_manually/connectors-DRK-connectors/categories.txt
list
ClusterGermsLRDisjuncts
C01['is', 'was'][][]['{C03C01- or C04C01- or C05C01- or C06C01-} & {C01C08+}', '{C03C01- or C04C01- or C05C01-} & {C01C06+ or C01C07+ or C01C08+}']
C02['liked', 'likes'][][]['{C03C02- or C05C02-} & {C02C04+ or C02C07+}', '{C03C02- or C05C02- or C06C02-} & {C02C04+ or C02C08+}']
C03['LEFT-WALL', 'a'][][]['C03C01+ or C03C02+ or C03C04+ or C03C05+ or C03C06+ or C03C09+', 'C03C05+ or C03C06+ or C03C08+']
C04['cake', 'sausage'][][]['{C02C04- or C03C04-} & {C04C01+}', '{C02C04- or C03C04-} & {C04C01+}']
C05['dad', 'mom'][][]['{C03C05-} & {C05C01+ or C05C02+}', '{C03C05-} & {C05C01+ or C05C02+}']
C06['daughter', 'son'][][]['{C01C06- or C03C06-} & {C06C01+ or C06C02+}', '{C01C06- or C03C06-} & {C06C01+ or C06C02+}']
C07['before', 'not'][][]['C01C07- or C02C07-', 'C01C07-']
C08['child', 'food', 'human', 'now', 'parent'][][]['C01C08- or C03C08-', 'C01C08- or C03C08-', 'C01C08- or C03C08-', 'C01C08- or C02C08-', 'C01C08- or C03C08-']
C09['.'][][]['C03C09-']
Out[7]:
'Link Grammar file saved to: /home/oleg/language-learning/output/Pictures-2018-04-13/POC_English_NoAmb/MST_fixed_manually/connectors-DRK-connectors/poc-english_9C_2018-04-13_0007.4.0.dict'

MST fixed manually, Disjuncts-DRK-Disjuncts

In [8]:
# context 2 = disjuncts with a limited number of links,
# grammar_rules 2 = 'disjuncts' (see gimme_pictures).
context, grammar_rules = 2, 2
prj_dir = batch_dir + 'disjuncts-DRK-disjuncts/'
# Last expression of the cell: its value is shown as the cell output.
gimme_pictures(input_dir, prj_dir, context, grammar_rules, verbose)
20 unique words and 73 unique links form 102 unique word-link pairs from 242 parsed items
Vector space dimensionality = 15
Singular value (15) = 1.5
Max singular value (1) = 11.1
All singular values within relevance interval - more than 0.1 of max singular value
Silhouette index in a range of cluster numbers
Optimal number of clusters: 10
Cluster words in vector space, axes 1 and 4
ParentCategoryQualityWordsRelevance
C00C010.0['LEFT-WALL', 'is', 'likes', 'now'][0, 0, 0, 0]
C00C020.0['daughter', 'son'][0, 0]
C00C030.0['child', 'human'][0, 0]
C00C040.0['cake', 'sausage'][0, 0]
C00C050.0['food', 'parent'][0, 0]
C00C060.0['before', 'not'][0, 0]
C00C070.0['dad', 'mom'][0, 0]
C00C080.0['.', 'was'][0, 0]
C00C090.0['a'][0]
C00C100.0['liked'][0]
Category list - 10 lines, saved to /home/oleg/language-learning/output/Pictures-2018-04-13/POC_English_NoAmb/MST_fixed_manually/disjuncts-DRK-disjuncts/categories.txt
list
ClusterGermsLRDisjuncts
C01['LEFT-WALL', 'is', 'likes', 'now'][][]['C01C01+ & C01C08+ & C01C02+', 'C01C04+ & C01C08+ & C01C01+', 'C01C07+ & C01C08+ & C01C08+', 'C01C08+ & C01C01+ & C01C07+', 'C01C01+ & C01C04+ & C01C08+', 'C01C02+ & C01C01+ & C01C08+', 'C01C02+ & C01C08+ & C01C01+', 'C01C01+ & C01C07+ & C01C08+', 'C01C01+ & C01C08+ & C01C07+', 'C01C08+ & C01C07+ & C01C08+', 'C01C10+ & C01C07+ & C01C08+', 'C01C08+ & C01C10+ & C01C07+', 'C01C04+ & C01C08+ & C01C08+', 'C01C01- & C02C01- & C01C01+ & C01C03+', 'C01C01- & C04C01- & C01C05+', 'C01C01- & C07C01- & C01C05+', 'C01C01- & C07C01- & C01C01+ & C01C05+', 'C01C01- & C02C01- & C01C03+', 'C01C01- & C07C01- & C01C03+', 'C01C01- & C04C01- & C01C01+ & C01C05+', 'C01C01- & C07C01- & C01C01+ & C01C03+', 'C01C01- & C07C01- & C01C04+', 'C01C01- & C02C01- & C01C01+ & C01C04+', 'C01C01- & C02C01- & C01C04+', 'C01C01- & C02C01- & C01C04+ & C01C01+', 'C01C01- & C07C01- & C01C04+ & C01C01+', 'C01C01- & C07C01- & C01C01+ & C01C04+', 'C01C01-']
C02['daughter', 'son'][][]['C08C02- & C09C02-', 'C01C02- & C09C02- & C02C01+', 'C01C02- & C02C01+', 'C08C02- & C09C02-', 'C01C02- & C09C02- & C02C01+', 'C01C02- & C02C01+']
C03['child', 'human'][][]['C09C03- & C01C03-', 'C09C03- & C01C03-']
C04['cake', 'sausage'][][]['C01C04-', 'C01C04- & C04C08+', 'C10C04-', 'C01C04- & C04C01+', 'C01C04-', 'C01C04- & C04C08+', 'C10C04-', 'C01C04- & C04C01+']
C05['food', 'parent'][][]['C09C05- & C01C05-', 'C08C05- & C09C05-', 'C09C05- & C01C05-', 'C08C05- & C09C05-']
C06['before', 'not'][][]['C10C06-', 'C08C06-', 'C08C06-']
C07['dad', 'mom'][][]['C01C07- & C07C01+', 'C01C07- & C09C07- & C07C01+', 'C01C07- & C07C08+', 'C01C07- & C07C10+', 'C01C07- & C07C01+', 'C01C07- & C09C07- & C07C01+', 'C01C07- & C07C08+', 'C01C07- & C07C10+']
C08['.', 'was'][][]['C01C08-', 'C01C08- & C07C08- & C08C02+ & C08C06+', 'C07C08- & C01C08- & C08C06+ & C08C06+ & C08C05+', 'C01C08- & C04C08- & C08C06+ & C08C05+', 'C01C08- & C07C08- & C08C06+ & C08C02+']
C09['a'][][]['C09C05+', 'C09C07+', 'C09C03+', 'C09C02+']
C10['liked'][][]['C01C10- & C07C10- & C10C04+ & C10C06+', 'C01C10- & C07C10- & C10C06+ & C10C04+']
Out[8]:
'Link Grammar file saved to: /home/oleg/language-learning/output/Pictures-2018-04-13/POC_English_NoAmb/MST_fixed_manually/disjuncts-DRK-disjuncts/poc-english_10C_2018-04-13_0007.4.0.dict'