Child Directed Speech pictures 2018-08-15

Basic settings

In [1]:
import os, sys, time
import matplotlib.pyplot as plt
%matplotlib inline
# Make the parent of the working directory and the grammar_learner sources
# importable; the project import below depends on these sys.path entries.
module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)
grammar_learner_path = module_path + '/src/grammar_learner/'
if grammar_learner_path not in sys.path:
    sys.path.append(grammar_learner_path)

from utl import UTC

# Output directory is stamped with the first 10 characters of the UTC string
# (the date part, e.g. '2018-08-15').
out_dir = module_path + '/output/CDS-pictures-' + str(UTC())[:10]

# Baseline options passed to clusters2tsne(**kwargs); individual run cells
# override or add entries before each call.
kwargs = {
    'left_wall': '',
    'period': False,
    'dim_max': 100,
    'sv_min': 0.1,
    'clustering': ('kmeans', 'kmeans++', 10),
    'cluster_range': (30, 120, 3),
    'cluster_criteria': 'silhouette',
    'cluster_level': 1,
    'tmpath': '',
    'verbose': 'min',
    'template_path': 'poc-turtle',
    'linkage_limit': 1000,
    'categories_generalization': 'off',
}

start = time.time()
print(UTC(), ':: module_path =', module_path)
2018-08-15 18:20:01 UTC :: module_path = /home/obaskov/language-learning

Cluster visualization: PCA+t-SNE, cluster labels -- random

In [2]:
def clusters2tsne(line, module_path, out_dir, **kwargs):
    """Cluster the words of one parsed corpus and embed the clusters in 2-D.

    Pipeline: locate the input parses, read their links, build an SVD word
    space, pick clusters with ``best_clusters``, reduce the cluster rows with
    PCA (50 components) and then t-SNE (2 components).

    Args:
        line: one row of the ``lines`` table. ``line[1]`` is the corpus name,
            ``line[2]`` the dataset name, and ``line[-1]`` the generalization
            mode: 'rules' or 'both' turns on rules generalization, 'cats' or
            'both' turns on categories generalization.
        module_path: forwarded to ``params``.
        out_dir: forwarded to ``params``.
        **kwargs: learner options. 'verbose', 'dim_max' and 'sv_min' are read
            here; the whole dict is also forwarded to ``params``,
            ``files2links`` and ``best_clusters``.

    Returns:
        (x, y, labels): two lists of t-SNE coordinates divided by 100 and
        clamped to [-1.0, 1.0], and an array with one label per cluster of
        the form '[word...]', where the word is drawn at random from the
        cluster's words.
    """
    from pqa_table  import params
    from learner    import learn_grammar
    from read_files import check_dir, check_mst_files
    from pparser    import files2links
    from hyperwords import vector_space_dim, pmisvd
    from clustering import best_clusters, group_links
    import random
    from sklearn.decomposition import PCA
    from sklearn.manifold import TSNE

    def _scale_and_clamp(coords):
        # Divide by 100, capping at 1.0 from above and at -1.0 from below.
        upper_capped = [c/100.0 if c < 100.0 else 1.0 for c in coords]
        return [c if c > -1.0 else -1.0 for c in upper_capped]

    def _random_label(words):
        # Label a cluster by one randomly chosen member word.
        return '['+str(random.choice(words))+'...]'

    # The last field of the row selects which generalizations are switched on.
    if line[-1] in ['rules','both']:
        kwargs['rules_generalization'] = 'jaccard'
    else:
        kwargs['rules_generalization'] = ''
    if line[-1] in ['cats','both']:
        kwargs['categories_generalization'] = 'jaccard'
    else:
        kwargs['categories_generalization'] = ''

    corpus_name = line[1]
    dataset_name = line[2]
    parses_path, oc_path, _og = params(corpus_name, dataset_name,
                                       module_path, out_dir, **kwargs)
    parse_files, _response = check_mst_files(parses_path, kwargs['verbose'])
    kwargs['input_files'] = parse_files
    links, _links_response = files2links(**kwargs)

    # Scratch directory for the word-space step lives under the params() path.
    tmpath = oc_path + '/tmp'
    check_dir(tmpath, True, 'none')
    dim = vector_space_dim(links, tmpath, tmpath,
                           kwargs['dim_max'], kwargs['sv_min'],
                           kwargs['verbose'])
    vdf, _sv, _response = pmisvd(links, tmpath, tmpath, dim)
    clusters_df, _silhouette, _inertia = best_clusters(vdf, **kwargs)

    # Columns 1..100 of the clusters frame are used as the feature vector.
    feature_columns = list(range(1, 101))
    clusters_df['label'] = clusters_df['cluster_words'].apply(_random_label)

    pca_model = PCA(n_components=50)
    pca_coords = pca_model.fit_transform(clusters_df[feature_columns].values)
    tsne_model = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
    tsne_coords = tsne_model.fit_transform(pca_coords)

    x = _scale_and_clamp(tsne_coords[:,0])
    y = _scale_and_clamp(tsne_coords[:,1])
    labels = clusters_df['label'].values
    return x, y, labels

def plot2d(x,y,labels,size=(12,9),title=''):
    """Scatter-plot 2-D points and annotate each point with its label.

    Args:
        x: sequence of horizontal coordinates, one per label.
        y: sequence of vertical coordinates, one per label.
        labels: sequence of text labels; its length is reported in the title
            as the number of clusters.
        size: figure size passed to ``plt.figure(figsize=...)``.
        title: prefix of the figure title; ', <n> clusters, PCA+tSNE' is
            appended to it.
    """
    import matplotlib.pyplot as plt
    # Honour the caller's size; it used to be hard-coded to (12, 9), which
    # silently ignored the `size` argument.
    plt.figure(figsize=size)
    plt.scatter(x, y)
    for i, label in enumerate(labels):
        plt.annotate(label, (x[i], y[i]))
    plt.title(title+', '+str(len(labels))+' clusters, PCA+tSNE')
    plt.show()

ULL Project Plan ⇒ Parses ⇒ lines 58-61

In [3]:
# One row per experiment: [plan line number, corpus, dataset, 0, 0, generalization].
# clusters2tsne reads only the corpus (index 1), the dataset (index 2) and the
# generalization mode (last element).
lines = [
    [row_id, 'CDS-caps-br-text+brent9mos', dataset, 0, 0, generalization]
    for row_id, dataset, generalization in (
        (58, 'LG-English', 'none'),
        (59, 'LG-English', 'rules'),
        (60, 'R=6-Weight=6:R-mst-weight=+1:R', 'none'),
        (61, 'R=6-Weight=6:R-mst-weight=+1:R', 'rules'),
    )
]

Connectors-DRK-Connectors

In [4]:
%%time
%%capture
# First run of the "Connectors-DRK-Connectors" section. The kwargs entries set
# here stay in the shared dict and are inherited by later run cells.
kwargs['context'] = 1
kwargs['word_space'] = 'vectors'
kwargs['clustering'] = 'kmeans'  # replaces the ('kmeans', 'kmeans++', 10) tuple from the settings cell
kwargs['grammar_rules'] = 1
line = lines[0]  # row 58: 'LG-English' dataset, generalization 'none'
# NOTE(review): equal bounds presumably pin the number of clusters at 99 — confirm in best_clusters.
kwargs['cluster_range'] = (99,99,1)
x,y,labels = clusters2tsne(line, module_path, out_dir, **kwargs)
CPU times: user 58 s, sys: 967 ms, total: 58.9 s
Wall time: 47.1 s
In [5]:
# Plot the coordinates produced by the preceding run cell, then print a timestamp.
plot2d(x, y, labels, size=(12, 9),
       title='Child Directed Speech, ' + line[2] + ', cDRKc')
print(UTC())
2018-08-15 18:20:49 UTC
In [6]:
%%time
%%capture
# Same settings as the previous run cell (context, word_space, clustering and
# grammar_rules are still in kwargs when cells run in order); only the dataset
# row and the cluster range change.
line = lines[2]  # row 60: 'R=6-Weight=6:R-mst-weight=+1:R' dataset, generalization 'none'
kwargs['cluster_range'] = (87,87,1)
x,y,labels = clusters2tsne(line, module_path, out_dir, **kwargs)
CPU times: user 1min 2s, sys: 840 ms, total: 1min 3s
Wall time: 50.6 s
In [7]:
# Plot the coordinates produced by the preceding run cell, then print a timestamp.
plot2d(x, y, labels, size=(12, 9),
       title='Child Directed Speech, ' + line[2] + ', cDRKc')
print(UTC())
2018-08-15 18:21:39 UTC

Connectors-DRK-Disjuncts

In [8]:
%%time
%%capture
# First run of the "Connectors-DRK-Disjuncts" section: grammar_rules switches
# to 2 and stays at 2 for all later cells; context is still 1 from the earlier
# section when cells run in order.
line = lines[0]  # row 58: 'LG-English' dataset, generalization 'none'
kwargs['grammar_rules'] = 2
kwargs['cluster_range'] = (99,99,1)
x,y,labels = clusters2tsne(line, module_path, out_dir, **kwargs)
CPU times: user 57.3 s, sys: 868 ms, total: 58.1 s
Wall time: 46 s
In [9]:
# Plot the coordinates produced by the preceding run cell, then print a timestamp.
plot2d(x, y, labels, size=(12, 9),
       title='Child Directed Speech, ' + line[2] + ', cDRKd')
print(UTC())
2018-08-15 18:22:26 UTC
In [10]:
%%time
%%capture
# Row 59 ends with 'rules', so clusters2tsne sets
# kwargs['rules_generalization'] to 'jaccard' for this run.
line = lines[1]  # row 59: 'LG-English' dataset, generalization 'rules'
kwargs['cluster_range'] = (84,84,1)
x,y,labels = clusters2tsne(line, module_path, out_dir, **kwargs)
CPU times: user 56.1 s, sys: 744 ms, total: 56.9 s
Wall time: 45.6 s
In [11]:
# Plot the coordinates produced by the preceding run cell, then print a timestamp.
plot2d(x, y, labels, size=(12, 9),
       title='Child Directed Speech, ' + line[2] + ', cDRKd, generalized')
print(UTC())
2018-08-15 18:23:12 UTC
In [12]:
%%time
%%capture
# Same section settings (grammar_rules is still 2 when cells run in order),
# applied to the second dataset without generalization.
line = lines[2]  # row 60: 'R=6-Weight=6:R-mst-weight=+1:R' dataset, generalization 'none'
kwargs['cluster_range'] = (65,65,1)
x,y,labels = clusters2tsne(line, module_path, out_dir, **kwargs)
CPU times: user 1min, sys: 760 ms, total: 1min 1s
Wall time: 49.8 s
In [13]:
# Plot the coordinates produced by the preceding run cell, then print a timestamp.
plot2d(x, y, labels, size=(12, 9),
       title='Child Directed Speech, ' + line[2] + ', cDRKd')
print(UTC())
2018-08-15 18:24:02 UTC
In [14]:
%%time
%%capture
# Row 61 ends with 'rules', so clusters2tsne sets
# kwargs['rules_generalization'] to 'jaccard' for this run.
line = lines[3]  # row 61: 'R=6-Weight=6:R-mst-weight=+1:R' dataset, generalization 'rules'
kwargs['cluster_range'] = (65,65,1)
x,y,labels = clusters2tsne(line, module_path, out_dir, **kwargs)
CPU times: user 1min 1s, sys: 776 ms, total: 1min 1s
Wall time: 50.7 s
In [15]:
# Plot the coordinates produced by the preceding run cell, then print a timestamp.
plot2d(x, y, labels, size=(12, 9),
       title='Child Directed Speech, ' + line[2] + ', cDRKd, generalized')
print(UTC())
2018-08-15 18:24:53 UTC

Disjuncts-DRK-Disjuncts

In [16]:
%%time
%%capture
# First run of the "Disjuncts-DRK-Disjuncts" section: context switches to 2
# and stays at 2 for the remaining cells; grammar_rules is still 2 from the
# previous section when cells run in order.
line = lines[0]  # row 58: 'LG-English' dataset, generalization 'none'
kwargs['context'] = 2
kwargs['cluster_range'] = (100,100,1)
x,y,labels = clusters2tsne(line, module_path, out_dir, **kwargs)
CPU times: user 56.6 s, sys: 744 ms, total: 57.3 s
Wall time: 46.9 s
In [17]:
# Plot the coordinates produced by the preceding run cell, then print a timestamp.
plot2d(x, y, labels, size=(12, 9),
       title='Child Directed Speech, ' + line[2] + ', dDRKd')
print(UTC())
2018-08-15 18:25:40 UTC
In [18]:
%%time
%%capture
# Row 59 ends with 'rules', so clusters2tsne sets
# kwargs['rules_generalization'] to 'jaccard' for this run.
line = lines[1]  # row 59: 'LG-English' dataset, generalization 'rules'
kwargs['cluster_range'] = (66,66,1)
x,y,labels = clusters2tsne(line, module_path, out_dir, **kwargs)
CPU times: user 54.6 s, sys: 592 ms, total: 55.2 s
Wall time: 46.2 s
In [19]:
# Plot the coordinates produced by the preceding run cell, then print a timestamp.
plot2d(x, y, labels, size=(12, 9),
       title='Child Directed Speech, ' + line[2] + ', dDRKd, generalized')
print(UTC())
2018-08-15 18:26:26 UTC
In [20]:
%%time
%%capture
# Same section settings (context and grammar_rules are both still 2 when cells
# run in order), applied to the second dataset without generalization.
line = lines[2]  # row 60: 'R=6-Weight=6:R-mst-weight=+1:R' dataset, generalization 'none'
kwargs['cluster_range'] = (100,100,1)
x,y,labels = clusters2tsne(line, module_path, out_dir, **kwargs)
CPU times: user 1min 1s, sys: 752 ms, total: 1min 2s
Wall time: 51.7 s
In [21]:
# Plot the coordinates produced by the preceding run cell, then print a timestamp.
plot2d(x, y, labels, size=(12, 9),
       title='Child Directed Speech, ' + line[2] + ', dDRKd')
print(UTC())
2018-08-15 18:27:18 UTC
In [22]:
%%time
%%capture
# Row 61 ends with 'rules', so clusters2tsne sets
# kwargs['rules_generalization'] to 'jaccard' for this run.
line = lines[3]  # row 61: 'R=6-Weight=6:R-mst-weight=+1:R' dataset, generalization 'rules'
kwargs['cluster_range'] = (100,100,1)
x,y,labels = clusters2tsne(line, module_path, out_dir, **kwargs)
CPU times: user 1min 2s, sys: 892 ms, total: 1min 3s
Wall time: 52.2 s
In [23]:
# Plot the coordinates produced by the preceding run cell, then print a timestamp.
plot2d(x, y, labels, size=(12, 9),
       title='Child Directed Speech, ' + line[2] + ', dDRKd, generalized')
print(UTC())
2018-08-15 18:28:11 UTC