2018-04-12+
Static HTML with images -- http://88.99.210.144/data/clustering_2018/html/Pictures-2018-04-12.html
Data -- http://88.99.210.144/data/clustering_2018/Pictures-2018-04-12/
import os, sys, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path: sys.path.append(module_path)
from src.utl.utl import UTC
from src.utl.read_files import check_dir
print(UTC(), module_path)
prefix = ''  # unused option
tmpath = module_path + '/tmp/'
check_dir(tmpath, True, 'none')
date = '2018-04-13'
verbose = 'mid'
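check_dir above is a project helper from src.utl.read_files; its implementation is not shown in this notebook, but from its call sites a minimal stand-in with the same calling convention (path, create flag, verbosity) might look like the sketch below. The name check_dir_sketch and the exact return/print behavior are assumptions, not the project's actual code.
import os

def check_dir_sketch(path, create=False, verbose='none'):
    """Hypothetical stand-in for src.utl.read_files.check_dir:
    return True if `path` is a directory, optionally creating it."""
    if os.path.isdir(path):
        return True
    if create:
        os.makedirs(path, exist_ok=True)
        if verbose != 'none':
            print('Created directory:', path)
        return True
    if verbose != 'none':
        print('No such directory:', path)
    return False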
def gimme_pictures(input_dir, prj_dir, context, grammar_rules, verbose='mid'):
    from src.utl.read_files import check_dir
    from src.grammar_learner.poc03 import learn_grammar
    parse_mode = 'given'        # 'given' (default) / 'explosive' (next)
    left_wall = 'LEFT-WALL'     # '' or 'none' - don't use / 'LW' - replace ###LEFT-WALL### with 'LW'
    period = True               # use period in link learning: True / False
    # context = 1               # 1: connectors / 0: words /
    # context = 2               # 2, 3, ...: disjuncts with a limited number of links
    window = 'mst'              # 'mst' / reserved options for "explosive" parsing
    weighting = 'ppmi'          # 'ppmi' / future weighting options
    group = True                # group items after link parsing, sum counts
    # TODO: distance = False?
    word_space = 'vectors'      # 'vectors' / 'discrete' - no dimensionality reduction
    dim_max = 100               # max vector space dimensionality
    sv_min = 0.1                # minimal singular value (as a fraction of the maximum)
    dim_reduction = 'svm'       # 'svm' / 'none' (for discrete word_space with grouping)
    clustering = 'kmeans'       # 'kmeans' / 'group' ~ 'identical_entries' / future options
    cluster_range = (2, 48, 1)  # min, max, step
    cluster_criteria = 'silhouette'
    cluster_level = 0.9         # 0...1: 0 - maximum number of clusters
    generalization = 'off'      # 'off' / future options: 'cosine', ...
    merge = 0.8                 # merge clusters with similarity > this criterion
    aggregate = 0.2             # agglomerate clusters with similarity > this criterion
    # grammar_rules is a parameter: 1 - connectors / 2 - disjuncts / 0 - words (TODO?)
    # verbose is a parameter: 'none', 'min', 'mid', 'max' - display of intermediate results
    if check_dir(prj_dir, create=True, verbose='none'):
        cat_path = prj_dir      # path to store the learned categories
        dict_path = prj_dir     # path to store the learned Link Grammar dictionary file
        lg_rules_str = learn_grammar(  # tmpath comes from the setup cell above
            input_dir, cat_path, dict_path, tmpath, verbose,
            parse_mode, left_wall, period, context, window, weighting, group,
            word_space, dim_max, sv_min, dim_reduction,
            clustering, cluster_range, cluster_criteria, cluster_level,
            generalization, merge, aggregate, grammar_rules)
        return lg_rules_str.split('\n')[-1][2:]  # last line of the rules string
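The word_space / dim_reduction / clustering / cluster_criteria settings above describe the DRK path (dimensionality reduction plus K-means); the actual work happens inside learn_grammar in src.grammar_learner.poc03 and is not reproduced here. The sketch below only illustrates one plausible reading of the two numeric knobs: keep SVD components whose singular value is at least sv_min times the largest one (capped at dim_max), then scan cluster_range for the K-means solution with the best silhouette score. All names and the toy matrix are illustrative assumptions, not the notebook's code.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def drk_sketch(counts, dim_max=100, sv_min=0.1, cluster_range=(2, 48, 1)):
    # SVD-based dimensionality reduction: keep components whose singular
    # value is at least sv_min * the largest singular value, up to dim_max.
    u, s, _ = np.linalg.svd(counts, full_matrices=False)
    k = min(dim_max, int(np.sum(s >= sv_min * s[0])))
    vectors = u[:, :k] * s[:k]

    # K-means over the cluster range, keeping the best silhouette score.
    lo, hi, step = cluster_range
    best_n, best_score, best_labels = None, -1.0, None
    for n in range(lo, min(hi, len(vectors) - 1) + 1, step):
        labels = KMeans(n_clusters=n, n_init=10, random_state=0).fit_predict(vectors)
        score = silhouette_score(vectors, labels)
        if score > best_score:
            best_n, best_score, best_labels = n, score, labels
    return best_n, best_score, best_labels

# Toy word-by-context count matrix: 6 "words", 4 "contexts".
toy = np.array([[3, 0, 1, 0], [2, 1, 0, 0], [0, 4, 0, 1],
                [0, 3, 1, 1], [1, 0, 4, 2], [0, 1, 3, 2]], dtype=float)
print(drk_sketch(toy, dim_max=3, sv_min=0.1, cluster_range=(2, 4, 1)))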
print(UTC())
input_dir = module_path + '/data/POC_Turtle/MST_fixed_manually/'
batch_dir = module_path + '/output/Pictures-'+date+'/POC-Turtle/MST_fixed_manually/'
prj_dir = batch_dir + 'connectors-DRK-connectors/'
context = 1
grammar_rules = 1
gimme_pictures(input_dir, prj_dir, context, grammar_rules, verbose)
prj_dir = batch_dir + 'disjuncts-DRK-disjuncts/'
context = 2
grammar_rules = 2
gimme_pictures(input_dir, prj_dir, context, grammar_rules, verbose)
input_dir = module_path + '/data/POC_English_NoAmb/MST_fixed_manually/'
batch_dir = module_path + '/output/Pictures-'+date+'/POC_English_NoAmb/MST_fixed_manually/'
prj_dir = batch_dir + 'connectors-DRK-connectors/'
context = 1
grammar_rules = 1
gimme_pictures(input_dir, prj_dir, context, grammar_rules, verbose)
prj_dir = batch_dir + 'disjuncts-DRK-disjuncts/'
context = 2
grammar_rules = 2
gimme_pictures(input_dir, prj_dir, context, grammar_rules, verbose)
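The four runs above differ only in the corpus and in the context / grammar_rules pairing, so the batch could equally be written as a loop. A purely cosmetic restructuring with the same paths and parameters (the Turtle output directory keeps the hyphenated 'POC-Turtle' spelling used above):
corpora = [('POC_Turtle', 'POC-Turtle'),               # (data dir, output dir)
           ('POC_English_NoAmb', 'POC_English_NoAmb')]
configs = [('connectors-DRK-connectors', 1, 1),        # (subdir, context, grammar_rules)
           ('disjuncts-DRK-disjuncts', 2, 2)]
for data_name, out_name in corpora:
    input_dir = module_path + '/data/' + data_name + '/MST_fixed_manually/'
    batch_dir = module_path + '/output/Pictures-' + date + '/' + out_name + '/MST_fixed_manually/'
    for subdir, context, grammar_rules in configs:
        gimme_pictures(input_dir, batch_dir + subdir, context, grammar_rules, verbose)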