Goal: explain or discuss the fact that ILE cluster formation takes place only with MWC = 1 (min_word_count = 1) -- UPP Project plan.
"Gutenberg Children Books" corpus, new "LG-E-noQuotes" dataset (GC_LGEnglish_noQuotes_fullyParsed.ull),
trash filter off: min_word_count = 1
; max_sentence_length
off; Link Grammar 5.5.1.
This notebook is shared as static ILE-clustering-research-GCB-LG-E-noQuotes-2019-04-19.html.
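For orientation, here is a minimal sketch of the idea behind "Identical Lexical Entries" (ILE) clustering (toy data, not the grammar-learner implementation): words whose sets of disjuncts are exactly identical land in one cluster, while a word with a unique disjunct set forms a single-word "cluster".

from collections import defaultdict
lexical_entries = {                 # word -> disjuncts (toy data)
    'cat':  {'Ds- & Ss+'},
    'dog':  {'Ds- & Ss+'},          # identical entry -> same cluster as 'cat'
    'runs': {'Ss- & Pa+'},          # unique entry -> single-word cluster
}
clusters = defaultdict(list)
for word, djs in lexical_entries.items():
    clusters[tuple(sorted(djs))].append(word)
print(dict(clusters))
# {('Ds- & Ss+',): ['cat', 'dog'], ('Ss- & Pa+',): ['runs']}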
import os, sys, time, numpy as np, pandas as pd
from collections import Counter
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path: sys.path.append(module_path)
from src.grammar_learner.utl import UTC, kwa, test_stats
from src.grammar_learner.read_files import check_dir, check_mst_files
from src.grammar_learner.write_files import list2file
from src.grammar_learner.widgets import html_table
from src.grammar_learner.preprocessing import filter_links
tmpath = module_path + '/tmp/'; check_dir(tmpath, True, 'none')
print(UTC(), ':: module_path:', module_path)
corpus = 'GCB' # 'Gutenberg-Children-Books-Caps'
dataset = 'LG-E-noQuotes'
input_parses = module_path + '/data/GCB/LG-E-noQuotes/'
kwargs = {
'left_wall' : '' ,
'period' : False ,
'context' : 2 ,
'min_word_count': 1 ,
'word_space' : 'discrete' ,
'clustering' : 'group' ,
'cluster_range' : [0] ,
'top_level' : 0.01 ,
'grammar_rules' : 2 ,
'max_disjuncts' : 1000000 ,
'stop_words' : [] ,
'tmpath' : tmpath ,
'verbose' : '+' ,
'template_path' : 'poc-turtle',
'linkage_limit' : 1000 }
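Since min_word_count is the parameter under study, here is a toy illustration of the assumed semantics of the "trash filter" mentioned in the header (an assumption about its effect, not the filter_links internals): with MWC > 1, words observed fewer than MWC times would be dropped before clustering.

toy_counts = {'cat': 1, 'dog': 1, 'runs': 5}   # hypothetical corpus counts
mwc = 2                                        # any MWC > 1
print({w: c for w, c in toy_counts.items() if c >= mwc})   # {'runs': 5}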
rp = module_path + '/data/' + corpus + '/LG-E-noQuotes'
cp = rp # corpus path = reference_path
out_dir = module_path + '/output/' \
+ 'ILE-clustering-GCB-LG-E-noQuotes-' + str(UTC())[:10]
kwargs['output_grammar'] = out_dir
check_dir(out_dir, True)
print(UTC(), '\n', out_dir)
files, re01 = check_mst_files(input_parses, 'max')
kwargs['input_files'] = files; files
links, re02 = filter_links(files, **kwargs)
print(len(links), 'unique links (pairs of linked words in parses)')
links[['word', 'link', 'count']].head()
words = links[['word', 'count']].groupby('word').agg({'count': 'sum'}).reset_index()
print(len(words), 'unique words in links (pairs of linked words in parses)')
words.head()
Build a {word: number_of_observations} dictionary:
word_counts = words.set_index('word').to_dict()['count']
print(len(word_counts), 'words total,\n',
len([w for w,c in word_counts.items() if c < 2]), 'words observed only once,\n',
len([w for w,c in word_counts.items() if c == 2]), 'words observed twice,\n',
len([w for w,c in word_counts.items() if c > 2]), 'words observed more than twice')
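As an optional cross-check, the same dictionary can be rebuilt from the links table with collections.Counter (imported above); the groupby/agg result should match exactly.

cc = Counter()
for w, n in zip(links['word'], links['count']):
    cc[w] += int(n)
assert dict(cc) == word_counts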
df = links[['word', 'link', 'count']].copy()
df['disjuncts'] = [[x] for x in df['link']]    # wrap each disjunct in a list
del df['link']
# collect each word's disjuncts and total observation count:
df = df.groupby('word').agg({'disjuncts': 'sum', 'count': 'sum'}).reset_index()
df['words'] = [[x] for x in df['word']]        # wrap each word in a list
del df['word']
# a word's lexical entry = the sorted tuple of its disjuncts:
df['disjuncts'] = df['disjuncts'].apply(lambda x: tuple(sorted(x)))
df[['words', 'disjuncts', 'count']].head()
dj_list = df['disjuncts'].tolist()
djset = set(dj_list)
print(len(djset), 'unique disjuncts')
# ILE clustering: words sharing an identical disjunct tuple form one rule
rules = df[['words', 'disjuncts']].groupby('disjuncts')['words'].apply(sum) \
    .reset_index().copy().rename(columns = {'words': 'cluster_words'})
print(len(rules), 'grammar rules after clustering disjuncts')
rules[['cluster_words', 'disjuncts']].head()
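A tiny self-contained demo of this grouping step (toy data; sum(s, []) makes the list concatenation explicit):

toy = pd.DataFrame({'words': [['cat'], ['dog'], ['runs']],
                    'disjuncts': [('A- & B+',), ('A- & B+',), ('C-',)]})
print(toy.groupby('disjuncts')['words'].apply(lambda s: sum(s, [])).reset_index())
# ('A- & B+',) -> ['cat', 'dog']; ('C-',) -> ['runs']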
cluster_list = [c for c in rules['cluster_words'].tolist() if len(c) > 1]
print(len(cluster_list), 'clusters contain 2 or more words;',
len(rules) - len(cluster_list), '"clusters" are single-word')
print('First 12 clusters:\n')
for c in cluster_list[:12]: print(c)
cluster_sizes = Counter([len(c) for c in cluster_list])
print('Cluster sizes observed more than once:')
display(html_table([['Cluster size', 'Number of clusters']] +
sorted([[s,n] for s,n in cluster_sizes.items() if n > 1],
key = lambda x : x[1], reverse = True)))
print('Cluster sizes observed only once:\n', sorted([s for s,n in cluster_sizes.items() if n < 2]))
# unique words in clusters:
words_in_clusters = set([w for c in cluster_list for w in c])
print(len(words_in_clusters), 'unique words in clusters',
      '-- 100 sample words:\n\n', list(words_in_clusters)[:100])
# Number of observations in the whole corpus for each cluster member word
wcs = set([word_counts[w] for w in words_in_clusters]); wcs
clustered_word_counts = Counter([word_counts[w] for w in words_in_clusters])
clustered_word_counts
In the output above, "1: 3416" means that 3416 of the words represented in clusters are observed only once in the corpus, "2: 289" means that 289 words are observed twice, and so on; the 2 most frequent clustered words are observed 9 times each.
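A quick sanity check on this tally: the Counter's multiplicities must add up to the number of unique clustered words.

assert sum(clustered_word_counts.values()) == len(words_in_clusters)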
print(str(int(round(clustered_word_counts[1]/len(words_in_clusters)*100,0))) +
'% words in clusters are observed only once in the whole input corpus:\n',
clustered_word_counts[1], 'once observed words of',
len(words_in_clusters), 'total unique words in clusters.')
# map each clustered word to its corpus count (distinct from the
# clustered_word_counts Counter above, despite the similar name):
clustered_words_counts = {w: word_counts[w] for c in cluster_list for w in c}
frequent_words_counts = {w: c for w, c in clustered_words_counts.items() if c > 2}
print('Number of observations of clustered words in the whole corpus',
'for words observed more than twice:\n\n', frequent_words_counts)
# pattern = the sorted set of member-word corpus counts for each cluster
patterns = Counter([tuple(sorted(set([word_counts[w] for w in c]))) for c in cluster_list])
patterns
Comment: "(1, 5): 3" means that 3 clusters consist of words observed either once or 5 times in the input corpus.
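A worked toy example of the pattern key (hypothetical counts):

toy_counts = {'cat': 1, 'dog': 5, 'fox': 1}
print(tuple(sorted(set(toy_counts[w] for w in ['cat', 'dog', 'fox']))))   # (1, 5)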
once_observed_words = {w:c for w,c in word_counts.items() if c < 2}.keys()
filtered_cluster_list = [l for l in [[w for w in c if w not in once_observed_words]
for c in cluster_list] if len(l) > 1]
print(len(cluster_list) - len(filtered_cluster_list),
      'clusters of', len(cluster_list),
      'consist of words observed only once in the input corpus,\n',
      len(filtered_cluster_list),
      'clusters of words observed more than once in the input corpus:')
for l in filtered_cluster_list: print(l)
less_observed_words = {w:c for w,c in word_counts.items() if c < 3}.keys()
filtered_cluster_list = [l for l in [[w for w in c if w not in less_observed_words]
for c in cluster_list] if len(l) > 1]
print(len(filtered_cluster_list),
      'clusters of words observed more than twice in the input corpus:\n')
for l in filtered_cluster_list: print(l)
less_observed_words = {w:c for w,c in word_counts.items() if c < 4}.keys()
filtered_cluster_list = [l for l in [[w for w in c if w not in less_observed_words]
for c in cluster_list] if len(l) > 1]
print(len(filtered_cluster_list),
      'clusters of words observed more than 3 times in the input corpus:\n')
for l in filtered_cluster_list: print(l)
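The three filtering cells above repeat the same pattern; a reusable helper (a sketch -- the name filter_clusters is hypothetical) makes the frequency threshold explicit:

def filter_clusters(clusters, counts, min_count):
    """Drop words observed fewer than min_count times, then keep
    clusters that still contain 2 or more words."""
    kept = [[w for w in c if counts[w] >= min_count] for c in clusters]
    return [c for c in kept if len(c) > 1]

for mc in (2, 3, 4):   # the same thresholds as the cells above
    print('min_count =', mc, '->', len(filter_clusters(cluster_list, word_counts, mc)), 'clusters')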
The unfiltered "Gutenberg Children Books" corpus "LG-E-noQuotes" dataset contains
22067 unique words. "Identical Lexical Entries" clustering yields 18939 grammar rules
(clusters); the multi-word clusters contain 3789 unique words, of which
3416 (90%) are observed only once in the corpus.
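For reference, the 90% figure follows directly from the counts above:

print(str(round(3416 / 3789 * 100)) + '% of clustered words are observed only once')   # 90%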