[121]:
import tqdm
import pathlib
import re
import json
try:
import dawg
except ImportError:
import dawg_python as dawg
# Extract a one-sentence description for every candidate entity: the first
# sentence of the first paragraph listed for its page in the paragraph-links files.

# Title index: `index.get(page)` yields the integer id stored for a page title
# (or None when the title is not in the index).
root = pathlib.Path('wiki/nlwiki-20220301')
dawgfile = root / 'index_nlwiki-20220301.dawg'
index = dawg.IntDAWG()
index.load(str(dawgfile))

# Candidate ids of interest: every id that occurs in any candidate list.
with open('wiki/nlwiki-20220301/experiments/clean-q1.json', encoding='utf-8') as f:
    candidates = json.load(f)
entities = set(int(c) for cs in candidates.values() for c in cs)

# Raw strings keep the backslashes as regex escapes instead of (invalid)
# string escapes; the compiled patterns are unchanged.
brackets = re.compile(r'\([^)]*\) ?')   # a parenthesised aside plus one optional trailing space
phrase = re.compile(r'.{10,}?\. ')      # shortest prefix of >= 10 chars that ends in ". "

descriptions = []
files = list(pathlib.Path('wiki/nlwiki-20220301/nlwiki-20220301-paragraph-links/').glob('*'))
for fname in tqdm.tqdm(files):
    prev = None
    # Explicit UTF-8 so the result does not depend on the locale's default encoding.
    with open(fname, encoding='utf-8') as lines:
        for line in lines:
            fields = line.split('\t', 2)
            if len(fields) != 3:
                # Malformed line (fewer than three tab-separated fields): skip it
                # instead of aborting the whole run with a ValueError.
                continue
            page, _, text = fields
            if page == prev:
                # Only the first consecutive line of each page is considered.
                continue
            prev = page
            text = brackets.sub('', text)
            m = phrase.match(text)
            if m is None:
                continue
            e = index.get(page)
            if e in entities:
                descriptions.append((e, m.group()))

print(len(descriptions), 'descriptions')
with open('wiki/nlwiki-20220301/experiments/clean-q1-descriptions.tsv', 'w', encoding='utf-8') as fw:
    for e, d in descriptions:
        print(e, d, sep='\t', file=fw)
100%|██████████| 1000/1000 [00:11<00:00, 86.14it/s]
184022 descriptions
[36]:
import duckdb
import pandas as pd
import json
import pathlib
import tqdm
try:
import dawg
except ImportError:
import dawg_python as dawg
# Load the title index for the nlwiki-20220301 dump; `index.get(title)` is used
# further down to turn page titles into integer ids.
index = dawg.IntDAWG()
root = pathlib.Path('wiki/nlwiki-20220301')
dawgfile = root.joinpath('index_nlwiki-20220301.dawg')
index.load(str(dawgfile))
def loads(x):
    """Parse ``x`` as JSON, falling back to an empty dict when it cannot be parsed.

    Args:
        x: The value to decode; expected to be a JSON string.

    Returns:
        The decoded JSON value, or ``{}`` if ``x`` is not valid JSON or is not
        a string/bytes object at all (e.g. ``None`` or a float NaN).
    """
    try:
        return json.loads(x)
    except (TypeError, ValueError):
        # TypeError: x is not str/bytes/bytearray.
        # ValueError: malformed JSON (json.JSONDecodeError is a subclass).
        # Anything else (KeyboardInterrupt, SystemExit, ...) is deliberately
        # allowed to propagate instead of being silently swallowed.
        return {}
# Build two frames from a 50-file sample of the paragraph-links files:
#   texts: one row per paragraph (page id, text)
#   links: one row per link in a paragraph (name, target)
# Both carry the same running paragraph number in their index, which becomes
# the `index` column after reset_index() so the two frames can be joined on it.
root = pathlib.Path('wiki/nlwiki-20220301/nlwiki-20220301-paragraph-links/')
# glob() does not guarantee an order; sort so the 50-file sample is the same on every run.
sample_files = sorted(root.glob('*'))[:50]

n = 0  # number of paragraphs read so far; offsets each file's row index
texts, links = [], []
for fname in tqdm.tqdm(sample_files):
    # NOTE(review): default CSV quoting is active here, so a field starting with
    # '"' is parsed as a quoted field — confirm that matches how these files were written.
    df = pd.read_csv(fname, sep='\t', on_bad_lines='skip', names=['page', 'links', 'text'])
    # Page title -> integer id; nullable Int64 keeps titles missing from the index as <NA>.
    df['page'] = df['page'].map(index.get).astype('Int64')
    df.index += n
    n += len(df)
    # One row per (name, target) pair; explode() repeats the paragraph's index
    # value for each pair, which is what ties a link back to its paragraph.
    pairs = df.links.map(lambda x: list(loads(x).items())).explode()
    links.append(pd.DataFrame({'name': pairs.str[0], 'target': pairs.str[1]}))
    # Non-mutating drop: `df` itself is left intact.
    texts.append(df.drop(columns=['links']))

texts = pd.concat(texts).reset_index()
links = pd.concat(links).reset_index()
100%|██████████| 50/50 [00:03<00:00, 14.29it/s]
[118]:
# For links whose `name` is exactly 'Utrecht', count the rows per link target
# and show the 10 most frequent targets, most frequent first.
# `texts` and `links` are presumably resolved by DuckDB to the DataFrames of the
# same name built in the previous cell, joined on their shared `index` column —
# TODO confirm.
duckdb.query(
f"""
select target, count(*) c from texts, links
where texts.index=links.index and name == 'Utrecht'
group by target
order by c desc
limit 10
""").df()
[118]:
| target | c | |
|---|---|---|
| 0 | 803 | 900 |
| 1 | 776 | 174 |
| 2 | 707767 | 24 |
| 3 | 261716 | 16 |
| 4 | 2677914 | 5 |
| 5 | 221653 | 5 |
| 6 | 575655 | 4 |
| 7 | 2679365 | 4 |
| 8 | 347488 | 3 |
| 9 | 85308316 | 2 |
[122]:
import stopwordsiso as stopwords
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# For a handful of entities, list the n-grams that best characterise the
# paragraphs linking to each of them.

# Entity id -> description, read from the two-column TSV (id, description).
descfile = 'wiki/nlwiki-20220301/experiments/clean-q1-descriptions.tsv'
e_desc = pd.read_csv(descfile, sep='\t', on_bad_lines='skip', header=None, index_col=0)[1].to_dict()

# ents = [11775750, 2595790] # Openbaar Ministerie
# ents = [29520, 148] # China
ents = [803, 776, 707767] # Utrecht

# Every distinct (paragraph text, target) pair whose link target is one of `ents`.
sample = duckdb.query(
    f"""
select text, target from texts, links
where texts.index=links.index and target in {str(ents)}
""").df().drop_duplicates()

# Uni- to tri-grams occurring in at least 10 paragraphs and at most 75% of them,
# with Dutch stop words removed.
stop_words = list(stopwords.stopwords('nl'))
tfidf = TfidfVectorizer(ngram_range=(1,3), min_df=10, max_df=.75, stop_words=stop_words)
X = tfidf.fit_transform(sample['text'])
feat_weight = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))

for e in ents:
    # An entity may have no extracted description; print a placeholder
    # instead of failing with a KeyError.
    print(e, e_desc.get(e, '(no description)'))
    # Plain boolean NumPy mask (row positions) for the paragraphs linking to `e`.
    mask = (sample['target'] == e).to_numpy()
    # inverse_transform gives, per paragraph, the features present in it, so
    # value_counts() is the number of paragraphs each feature occurs in.
    wdocs = tfidf.inverse_transform(X[mask])
    wcount = pd.Series(wdocs).explode().value_counts()
    # Score = log(1 + paragraph count for this entity) * idf over the whole sample.
    top_feats = pd.Series({
        w: np.log1p(c) * feat_weight[w]
        for w, c in wcount.items()
    }).sort_values().iloc[::-1].head(10)
    print(top_feats)
    print()
803 Utrecht is een stad en gemeente in Nederland en de hoofdstad van de provincie Utrecht.
werk 16.226999
den haag 16.168381
utrecht stad 16.106695
haag 16.104145
rotterdam 16.102599
universiteit 16.101863
hoogleraar 16.091765
stad utrecht 16.040620
museum 16.017313
nederlandse stad 15.991953
dtype: float64
776 Utrecht is met een landoppervlakte van 1.485 km² de op een na kleinste provincie van Nederland.
provincie utrecht 15.735772
nederlandse provincie 15.461026
nederlandse provincie utrecht 15.311498
eemnes 15.273796
provincie 15.219166
eemnes provincie 15.066386
eemnes provincie utrecht 15.066386
zuid holland 14.828159
noord holland 14.789730
holland utrecht 14.663517
dtype: float64
707767 Het Sticht Utrecht was het territorium waarover de bisschoppen van Utrecht in de middeleeuwen als vorst de landsheerlijkheid uitoefenden.
sticht utrecht 14.500452
bisschop 14.426491
sticht 14.397942
bisschop utrecht 14.143663
luik 12.177787
ii 12.118064
graafschap 11.782087
iv 11.620562
iii 11.343168
hendrik 11.151790
dtype: float64
[ ]: