2018-08-15
¶import os, sys, time
import matplotlib.pyplot as plt
%matplotlib inline
# Project root: the parent of the current working directory, made absolute.
module_path = os.path.abspath('..')
grammar_learner_path = module_path + '/src/grammar_learner/'

# Make both the project root and the grammar learner sources importable,
# appending each only when it is not already on sys.path.
for _import_root in (module_path, grammar_learner_path):
    if _import_root not in sys.path:
        sys.path.append(_import_root)

# Must come after the sys.path changes above (utl is a project module).
from utl import UTC

# Output directory name ends with the first 10 characters of str(UTC())
# (presumably the YYYY-MM-DD date -- confirm against utl.UTC).
out_dir = module_path + '/output/CDS-pictures-' + str(UTC())[:10]
# Default keyword arguments shared by every clusters2tsne() call below.
# Later cells overwrite individual entries (e.g. 'clustering' and
# 'cluster_range') before each run; clusters2tsne() itself overwrites
# 'categories_generalization' in its own copy.
kwargs = {
    'left_wall': '',
    'period': False,
    # 'dim_max', 'sv_min' and 'verbose' are read explicitly by clusters2tsne().
    'dim_max': 100,
    'sv_min': 0.1,
    'clustering': ('kmeans', 'kmeans++', 10),
    'cluster_range': (30, 120, 3),
    'cluster_criteria': 'silhouette',
    'cluster_level': 1,
    'tmpath': '',
    'verbose': 'min',
    'template_path': 'poc-turtle',
    'linkage_limit': 1000,
    'categories_generalization': 'off',
}
# Wall-clock time at notebook start; `start` is not read again in the
# visible part of this notebook.
start = time.time()
# Log the value returned by the project helper UTC() together with the
# resolved project root.
print(UTC(), ':: module_path =', module_path)
def _scale_and_clamp(values, scale=100.0):
    """Divide each value by *scale* and clamp the result to [-1.0, 1.0]."""
    return [max(-1.0, min(1.0, value / scale)) for value in values]


def clusters2tsne(line, module_path, out_dir, **kwargs):
    """Cluster the words of one corpus/dataset and project the clusters to 2D.

    Pipeline: locate the input parses, read links from them, build a vector
    space (vector_space_dim + pmisvd), cluster it (best_clusters), then
    reduce the cluster feature columns with PCA followed by t-SNE.

    Args:
        line: Sequence describing the run. ``line[1]`` is the corpus name,
            ``line[2]`` the dataset name, and ``line[-1]`` the generalization
            mode: 'rules' or 'both' enables 'jaccard' rules generalization,
            'cats' or 'both' enables 'jaccard' categories generalization,
            anything else disables both.
        module_path: Project root, forwarded to ``params``.
        out_dir: Output directory, forwarded to ``params``.
        **kwargs: Settings forwarded to the project helpers. Must contain
            'verbose', 'dim_max' and 'sv_min'. The caller's dict is not
            modified: ``**kwargs`` gives this function its own copy.

    Returns:
        Tuple ``(x, y, labels)``: ``x`` and ``y`` are lists holding the two
        t-SNE coordinates divided by 100 and clamped to [-1.0, 1.0];
        ``labels`` is an array with one '[word...]' string per cluster,
        where the word is picked at random from that cluster's words (so
        labels differ between runs).

    Raises:
        ValueError: If the clusters table has none of the expected feature
            columns.
    """
    # Project-local imports are kept at function scope, as in the original
    # notebook; they rely on the sys.path set up at the top of the notebook.
    from pqa_table import params
    from read_files import check_dir, check_mst_files
    from pparser import files2links
    from hyperwords import vector_space_dim, pmisvd
    from clustering import best_clusters
    import random
    from sklearn.decomposition import PCA
    from sklearn.manifold import TSNE

    generalization = line[-1]
    kwargs['rules_generalization'] = (
        'jaccard' if generalization in ('rules', 'both') else '')
    kwargs['categories_generalization'] = (
        'jaccard' if generalization in ('cats', 'both') else '')

    corpus = line[1]
    dataset = line[2]
    # Only the first two results of params() are needed here.
    input_parses, oc, _ = params(corpus, dataset, module_path, out_dir, **kwargs)
    files, _ = check_mst_files(input_parses, kwargs['verbose'])
    kwargs['input_files'] = files
    links, _ = files2links(**kwargs)

    tmpath = oc + '/tmp'
    check_dir(tmpath, True, 'none')
    dim = vector_space_dim(links, tmpath, tmpath,
                           kwargs['dim_max'], kwargs['sv_min'],
                           kwargs['verbose'])
    vdf, _, _ = pmisvd(links, tmpath, tmpath, dim)
    df, _, _ = best_clusters(vdf, **kwargs)  # df = clusters pd.DataFrame

    # Feature columns are assumed to be labelled 1..dim (the original code
    # hard-coded 1..100, which only holds when dim == 100) -- TODO confirm
    # against best_clusters. Labels missing from df are skipped.
    feat_cols = [col for col in range(1, dim + 1) if col in df.columns]
    if not feat_cols:
        raise ValueError(
            'clusters table has no feature columns labelled 1..' + str(dim))

    df['label'] = df['cluster_words'].apply(
        lambda words: '[' + str(random.choice(words)) + '...]')

    features = df[feat_cols].values
    # PCA cannot extract more components than min(n_samples, n_features);
    # cap the original fixed 50 so small cluster counts do not fail.
    n_components = min(50, features.shape[0], features.shape[1])
    pca_result = PCA(n_components=n_components).fit_transform(features)

    # NOTE(review): scikit-learn 1.5 renamed `n_iter` to `max_iter`; the
    # original keyword is kept for the version this notebook was written
    # against -- confirm before upgrading scikit-learn.
    tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
    tsne_pca = tsne.fit_transform(pca_result)

    x = _scale_and_clamp(tsne_pca[:, 0])
    y = _scale_and_clamp(tsne_pca[:, 1])
    labels = df['label'].values
    return x, y, labels
def plot2d(x, y, labels, size=(12, 9), title=''):
    """Show a labelled 2D scatter plot of cluster positions.

    Args:
        x: Sequence of x coordinates.
        y: Sequence of y coordinates, same length as ``x``.
        labels: One annotation text per point.
        size: Figure size passed to matplotlib as ``figsize``
            (width, height in inches). Previously this argument was
            ignored and (12, 9) was always used; the default is unchanged.
        title: Prefix of the plot title; the number of labels and
            ', PCA+tSNE' are appended.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    import matplotlib.pyplot as plt

    plt.figure(figsize=size)
    plt.scatter(x, y)
    for label, x_pos, y_pos in zip(labels, x, y):
        plt.annotate(label, (x_pos, y_pos))
    plt.title(title + ', ' + str(len(labels)) + ' clusters, PCA+tSNE')
    plt.show()
# Run table, one row per corpus/dataset/generalization combination.
# clusters2tsne() reads row[1] as the corpus, row[2] as the dataset and
# row[-1] as the generalization mode; row[0], row[3] and row[4] are not
# read anywhere in the visible part of this notebook.
lines = [
    [row_id, 'CDS-caps-br-text+brent9mos', dataset, 0, 0, generalization]
    for row_id, dataset, generalization in (
        (58, 'LG-English', 'none'),
        (59, 'LG-English', 'rules'),
        (60, 'R=6-Weight=6:R-mst-weight=+1:R', 'none'),
        (61, 'R=6-Weight=6:R-mst-weight=+1:R', 'rules'),
    )
]
%%time
%%capture
# NOTE(review): %%capture normally suppresses the cell's output, including
# the figure and the final print -- confirm this is intended.
# Base configuration for the runs titled 'cDRKc'. These kwargs entries stay
# in effect for the following cells until they are overwritten.
kwargs['context'] = 1
kwargs['word_space'] = 'vectors'
kwargs['clustering'] = 'kmeans'
kwargs['grammar_rules'] = 1
# Row 0 of `lines`: dataset 'LG-English', generalization 'none'.
line = lines[0]
# Equal first and second values -- presumably pins the run to exactly
# 99 clusters; confirm against best_clusters.
kwargs['cluster_range'] = (99,99,1)
x,y,labels = clusters2tsne(line, module_path, out_dir, **kwargs)
plot2d(x,y,labels,(12,9),'Child Directed Speech, '+line[2]+', cDRKc')
print(UTC())
%%time
%%capture
# Same kwargs as the previous cell (context=1, grammar_rules=1), applied to
# row 2 of `lines`: dataset 'R=6-Weight=6:R-mst-weight=+1:R',
# generalization 'none'.
line = lines[2]
# Presumably pins the run to exactly 87 clusters -- confirm against
# best_clusters.
kwargs['cluster_range'] = (87,87,1)
x,y,labels = clusters2tsne(line, module_path, out_dir, **kwargs)
plot2d(x,y,labels,(12,9),'Child Directed Speech, '+line[2]+', cDRKc')
print(UTC())
%%time
%%capture
# Row 0 of `lines`: dataset 'LG-English', generalization 'none'.
line = lines[0]
# Switch 'grammar_rules' from 1 to 2; this stays in effect for all later
# cells. Plot titles change from 'cDRKc' to 'cDRKd' from here on.
kwargs['grammar_rules'] = 2
kwargs['cluster_range'] = (99,99,1)
x,y,labels = clusters2tsne(line, module_path, out_dir, **kwargs)
plot2d(x,y,labels,(12,9),'Child Directed Speech, '+line[2]+', cDRKd')
print(UTC())
%%time
%%capture
# Row 1 of `lines`: dataset 'LG-English', generalization 'rules', which
# makes clusters2tsne() set 'rules_generalization' to 'jaccard'.
# kwargs still carry context=1 and grammar_rules=2 from earlier cells.
line = lines[1]
kwargs['cluster_range'] = (84,84,1)
x,y,labels = clusters2tsne(line, module_path, out_dir, **kwargs)
plot2d(x,y,labels,(12,9),'Child Directed Speech, '+line[2]+', cDRKd, generalized')
print(UTC())
%%time
%%capture
# Row 2 of `lines`: dataset 'R=6-Weight=6:R-mst-weight=+1:R',
# generalization 'none'. kwargs still carry context=1 and grammar_rules=2.
line = lines[2]
kwargs['cluster_range'] = (65,65,1)
x,y,labels = clusters2tsne(line, module_path, out_dir, **kwargs)
plot2d(x,y,labels,(12,9),'Child Directed Speech, '+line[2]+', cDRKd')
print(UTC())
%%time
%%capture
# Row 3 of `lines`: dataset 'R=6-Weight=6:R-mst-weight=+1:R',
# generalization 'rules' (clusters2tsne() sets 'rules_generalization' to
# 'jaccard'). kwargs still carry context=1 and grammar_rules=2.
line = lines[3]
kwargs['cluster_range'] = (65,65,1)
x,y,labels = clusters2tsne(line, module_path, out_dir, **kwargs)
plot2d(x,y,labels,(12,9),'Child Directed Speech, '+line[2]+', cDRKd, generalized')
print(UTC())
%%time
%%capture
# Row 0 of `lines`: dataset 'LG-English', generalization 'none'.
line = lines[0]
# Switch 'context' from 1 to 2; this stays in effect for all later cells
# (grammar_rules is still 2). Plot titles change to 'dDRKd' from here on.
kwargs['context'] = 2
kwargs['cluster_range'] = (100,100,1)
x,y,labels = clusters2tsne(line, module_path, out_dir, **kwargs)
plot2d(x,y,labels,(12,9),'Child Directed Speech, '+line[2]+', dDRKd')
print(UTC())
%%time
%%capture
# Row 1 of `lines`: dataset 'LG-English', generalization 'rules'
# (clusters2tsne() sets 'rules_generalization' to 'jaccard').
# kwargs still carry context=2 and grammar_rules=2 from earlier cells.
line = lines[1]
kwargs['cluster_range'] = (66,66,1)
x,y,labels = clusters2tsne(line, module_path, out_dir, **kwargs)
plot2d(x,y,labels,(12,9),'Child Directed Speech, '+line[2]+', dDRKd, generalized')
print(UTC())
%%time
%%capture
# Row 2 of `lines`: dataset 'R=6-Weight=6:R-mst-weight=+1:R',
# generalization 'none'. kwargs still carry context=2 and grammar_rules=2.
line = lines[2]
kwargs['cluster_range'] = (100,100,1)
x,y,labels = clusters2tsne(line, module_path, out_dir, **kwargs)
plot2d(x,y,labels,(12,9),'Child Directed Speech, '+line[2]+', dDRKd')
print(UTC())
%%time
%%capture
# Row 3 of `lines`: dataset 'R=6-Weight=6:R-mst-weight=+1:R',
# generalization 'rules' (clusters2tsne() sets 'rules_generalization' to
# 'jaccard'). kwargs still carry context=2 and grammar_rules=2.
line = lines[3]
kwargs['cluster_range'] = (100,100,1)
x,y,labels = clusters2tsne(line, module_path, out_dir, **kwargs)
plot2d(x,y,labels,(12,9),'Child Directed Speech, '+line[2]+', dDRKd, generalized')
print(UTC())