2018-04-22
Data for the AGI-2018 paper: learn a grammar with various Grammar Learner settings using various parses of the "POC-English-NoAmb" corpus.
Tests: "POC-Turtle" corpus: http://88.99.210.144/data/clustering_2018/data/POC_Turtle/;
Test results: http://88.99.210.144/data/clustering_2018/AGI-2018/POC-Turtle/.
A static html copy of this notebook is shared via
http://88.99.210.144/data/clustering_2018/html/POC-Turtle-8-Test-MST-parses.html
import os, sys, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path: sys.path.append(module_path)
from src.utl.utl import UTC
from src.utl.read_files import check_dir
from src.grammar_learner.poc03 import learn_grammar
# --- Input/output path setup (runs once per notebook session) ---
# Baseline parse set: the manually fixed MST parses of POC-Turtle.
input_dir = module_path + '/data/POC-Turtle/MST_fixed_manually/'
print('Baseline dataset:\n-', input_dir)
# Batch root: each subdirectory holds one alternative parse of the corpus.
input_batch = module_path + '/data/POC-Turtle/'
# os.walk yields (dirpath, dirnames, filenames) tuples; x[0] keeps the path,
# [1:] drops the batch root itself, leaving only the parse subdirectories.
dirs = sorted([x[0] for x in os.walk(input_batch)])[1:]
print('Input data directory structure (input_batch subdirs):')
for d in dirs: print('-',d)
# Date-stamped output root: str(UTC())[:10] is the 'YYYY-MM-DD' prefix.
out_dir = module_path + '/output/AGI-2018-paper-data-' + str(UTC())[:10] + '/'
print('Output data directory (out_dir):\n-', out_dir)
prefix = '' # unused option
# Scratch directory; test_learn_grammar reads this module-level global.
tmpath = module_path + '/tmp/'
check_dir(tmpath, True, 'none')  # create tmpath if missing, no logging
print(UTC())
def test_learn_grammar(input_dir, prj_dir, left_wall, period, context, word_space, \
        dim_reduction, clustering, grammar_rules, \
        verbose = 'none'):
    """Run learn_grammar on one parse directory with fixed AGI-2018 settings.

    Args:
        input_dir:      directory with parse files (1 to many input files).
        prj_dir:        project directory; learned categories and the learned
                        Link Grammar dictionary file are both stored here.
        left_wall:      '' / 'none' to drop, or a token such as 'LEFT-WALL'
                        to replace ###LEFT-WALL### with in the parses.
        period:         True/False - use sentence periods in link learning.
        context:        1: connectors / 0: 'words' /
                        2,3...: disjuncts with a limited number of links.
        word_space:     'vectors' - SVM dimensionality reduction /
                        'discrete' - no dimensionality reduction.
        dim_reduction:  'svm' / 'none' (use 'none' with discrete word_space).
        clustering:     'kmeans' / 'group' (group identical lexical entries).
        grammar_rules:  1: connectors / 2: disjuncts.
        verbose:        'none' / 'min' / 'mid' / 'max' logging level.

    Returns:
        The learned Link Grammar rules (.dict contents) as a string.

    Raises:
        RuntimeError: if prj_dir cannot be created or accessed.

    Note: relies on the module-level global `tmpath` for scratch files.
    """
    if not check_dir(prj_dir, create=True, verbose='none'):
        # The original left cat_path/dict_path unbound on failure, which
        # surfaced later as a confusing NameError; fail fast instead.
        raise RuntimeError('Cannot create project directory: ' + prj_dir)
    cat_path = prj_dir   # path to store learned categories
    dict_path = prj_dir  # path to store learned Link Grammar dictionary file
    # Fixed settings shared by all AGI-2018 test runs:
    parse_mode = 'given'      # 'given' (default) / 'explosive' (next)
    window = 'mst'            # 'mst' / reserved options for «explosive» parsing
    weighting = 'ppmi'        # 'ppmi' / future options
    group = True              # group items after link parsing, sum counts
    dim_max = 100             # max vector space dimensionality
    sv_min = 0.1              # minimal singular value (fraction of the max value)
    cluster_range = (2,48,1)  # min, max, step
    cluster_criteria = 'silhouette'
    cluster_level = 1         # 2018-04-21: strict max number of clusters
    generalization = 'off'    # 'off' / future options: 'cosine', ...
    merge = 0.8               # merge clusters with similarity > this criteria
    aggregate = 0.2           # agglomerate clusters with similarity > this criteria
    lg_rules = learn_grammar(input_dir, cat_path, dict_path, tmpath, verbose, \
        parse_mode, left_wall, period, context, window, weighting, group, \
        word_space, dim_max, sv_min, dim_reduction, \
        clustering, cluster_range, cluster_criteria, cluster_level,
        generalization, merge, aggregate, grammar_rules)
    return lg_rules  # .dict file contents as a string
print(UTC())
# --- Baseline test: MST_fixed_manually parses, "DRK" pipeline ---
# (connectors context -> SVM dimensionality Reduction -> K-means clustering)
left_wall = 'LEFT-WALL'
period = True
verbose = 'none'
batch_dir = out_dir + 'POC-Turtle-LEFT-WALL+period/'
print('Output data directory (batch_dir):', batch_dir)
context = 1 # 1: connectors / 2: disjuncts /
word_space = 'vectors' # 'vectors' - dimensionality reduction with SVM
dim_reduction = 'svm' # 'svm' / 'none' (discrete word_space, group)
clustering = 'kmeans' # 'kmeans' / 'group' / future options
generalization = 'off' # 'off' / future options: 'cosine', ...
grammar_rules = 2 # 1: connectors / 2 - disjuncts
verbose = 'mid'
subdir = 'connectors-DRK-disjuncts/'
prj_dir = batch_dir + 'MST_fixed_manually/' + subdir
#-print(prj_dir)
lg_rules = test_learn_grammar(input_dir, prj_dir, left_wall, period, \
    context, word_space, dim_reduction, clustering, grammar_rules, verbose)
# The last line of lg_rules holds the saved file path; [63:] trims the
# absolute prefix for display. NOTE(review): magic offset — breaks if
# module_path length changes; confirm against actual output.
print('LG rules file saved to ..' + lg_rules.split('\n')[-1][63:])
print(lg_rules)
# --- DRK batch: every parse subdir, connectors context only,
#     connector-based and disjunct-based grammar rules ---
left_wall = 'LEFT-WALL'
period = True
context = 1
grammar_rules = 2
word_space = 'vectors'
dim_reduction = 'svm'
clustering = 'kmeans'
generalization = 'off'
verbose = 'none'
# batch_dir[35:] trims the absolute path prefix for display (magic offset).
print('Link Grammar rule files saved to\n' + batch_dir[35:] + ':')
for d in dirs:
    # Only the 'connectors' context is looped here; the 'disjuncts' context
    # is run separately below because other datasets cause an SVM error.
    for i,context in enumerate(['connectors']):
        print('')
        for j,rules in enumerate(['connectors', 'disjuncts']):
            # enumerate indices map to numeric codes: i+1 -> context,
            # j+1 -> grammar_rules (1: connectors, 2: disjuncts).
            prj_dir = batch_dir + d[d.rfind('/')+1:] + '/'+context+'-DRK-'+rules+'/'
            lg_rules = test_learn_grammar(d, prj_dir, left_wall, period, i+1, \
                word_space, dim_reduction, clustering, j+1, verbose)
            print('.'+lg_rules.split('\n')[-1][124:])
Note: the other parse datasets cause an SVM error in the disjuncts context, so only the "MST_fixed_manually" parses are processed with that context below.
# --- Disjuncts-context DRK run, baseline dataset only ---
# (d is pinned to MST_fixed_manually; other parse sets break the SVM step.)
d = input_batch+'MST_fixed_manually'
i = 2
context = 'disjuncts'
verbose = 'none'
print('Link Grammar rule files saved to\n' + batch_dir[35:] + ':')
for j,rules in enumerate(['connectors', 'disjuncts']):
    prj_dir = batch_dir + d[d.rfind('/')+1:] + '/'+context+'-DRK-'+rules+'/'
    # NOTE(review): i=2 makes the numeric context argument i+1 = 3 here,
    # whereas the connectors loop above passes 1 and the ILE runs below
    # use 2 — confirm context 3 (disjuncts, more links) is intentional.
    lg_rules = test_learn_grammar(d, prj_dir, left_wall, period, i+1, \
        word_space, dim_reduction, clustering, j+1, verbose)
    print('.'+lg_rules.split('\n')[-1][124:])
# --- ILE pipeline (Identical Lexical Entries): discrete word space,
#     no dimensionality reduction, grouping of identical entries ---
context = 2 # 1: connectors / 2+: disjuncts
word_space = 'discrete' # grouping identical context sets
dim_reduction = 'none' # grouping identical context sets
clustering = 'group' # grouping identical context sets
grammar_rules = 2 # 1: connectors / 2+: disjuncts
subdir = 'disjuncts-ILE-disjuncts/'
verbose = 'mid'
prj_dir = batch_dir + 'MST_fixed_manually/' + subdir
lg_rules = test_learn_grammar(input_dir, prj_dir, left_wall, period, \
    context, word_space, dim_reduction, clustering, grammar_rules, verbose)
# [65:] trims the absolute path prefix for display (magic offset).
print('LG rules file saved to ..' + lg_rules.split('\n')[-1][65:])
print(lg_rules)
# --- ILE batch: run the disjuncts-ILE-disjuncts pipeline on every parse set ---
verbose = 'none'
# batch_dir[35:] trims the absolute path prefix for display (magic offset).
print('Link Grammar rule files saved to\n' + batch_dir[35:] + ':')
for d in dirs:
    prj_dir = batch_dir + d[d.rfind('/')+1:] + '/' + subdir
    # Bug fix: the original passed leftover loop counters from previous
    # cells (i+1 = 3, j+1) instead of the ILE settings chosen above; pass
    # the explicit context and grammar_rules (both 2) so every parse set
    # gets the same configuration as the single-dataset ILE run.
    lg_rules = test_learn_grammar(d, prj_dir, left_wall, period, context, \
        word_space, dim_reduction, clustering, grammar_rules, verbose)
    print('.'+lg_rules.split('\n')[-1][124:])
# --- Final batch: no LEFT-WALL, every parse set, DRK and ILE pipelines ---
left_wall = ''
period = True
word_space = 'vectors'
dim_reduction = 'svm'
clustering = 'kmeans'
verbose = 'none'
batch_dir = out_dir + 'POC-Turtle-no-LEFT-WALL/'
# batch_dir[35:] trims the absolute path prefix for display (magic offset).
print('Link Grammar rule files saved to\n' + batch_dir[35:] + ':\n')
for d in dirs:
    # DRK runs: enumerate indices map to numeric codes, i+1 -> context,
    # j+1 -> grammar_rules (1: connectors, 2: disjuncts).
    for i,context in enumerate(['connectors', 'disjuncts']):
        for j,rules in enumerate(['connectors', 'disjuncts']):
            prj_dir = batch_dir + d[d.rfind('/')+1:] + '/'+context+'-DRK-'+rules+'/'
            lg_rules = test_learn_grammar(d, prj_dir, left_wall, period, i+1, \
                word_space, dim_reduction, clustering, j+1, verbose)
            print('.'+lg_rules.split('\n')[-1][120:])
    # ILE run for the same parse set: discrete space, no reduction, grouping.
    ile_dir = batch_dir + d[d.rfind('/')+1:] + '/'+'disjuncts-ILE-disjuncts/'
    lg_rules = test_learn_grammar(d, ile_dir, left_wall, period, 2, \
        'discrete', 'none', 'group', 2, verbose)
    print('.'+lg_rules.split('\n')[-1][120:], '\n')