[121]:
import tqdm
import pathlib
import re
import json

try:
    import dawg
except ImportError:
    import dawg_python as dawg

# Page index: an IntDAWG loaded from disk. It is queried further down with
# index.get(<page title>) to obtain the integer id that is compared against
# `entities`.
root = pathlib.Path('wiki/nlwiki-20220301')
dawgfile = root / 'index_nlwiki-20220301.dawg'
index = dawg.IntDAWG()
index.load(str(dawgfile))

# Candidate entities. The JSON document is read as a mapping whose values are
# collections of ids; every id is coerced to int and collected into one set.
# The file is opened in a `with` block so the handle is closed, and decoded
# explicitly as UTF-8 instead of the platform's locale encoding.
with open(root / 'experiments' / 'clean-q1.json', encoding='utf-8') as fh:
    candidates = json.load(fh)
entities = {int(c) for cs in candidates.values() for c in cs}

# Raw strings: '\(' and '\.' are not valid str escapes and trigger a warning
# on recent Python versions. The compiled patterns are unchanged.
# brackets: one parenthesised span (no nesting) plus an optional trailing space.
brackets = re.compile(r'\([^)]*\) ?')
# phrase: the shortest run of at least 10 characters that ends in ". ",
# used as a first-sentence heuristic.
phrase = re.compile(r'.{10,}?\. ')

# Collect one (entity id, first sentence) pair per page that belongs to
# `entities`. Every input line is read as `page<TAB>...<TAB>text`; the middle
# field is not used here. Only the first line seen for a run of identical
# page values is considered.
descriptions = []
# Sorted so the order of `descriptions` (and of the file written from it) is
# reproducible; glob() itself yields entries in arbitrary order.
files = sorted(pathlib.Path('wiki/nlwiki-20220301/nlwiki-20220301-paragraph-links/').glob('*'))
for fname in tqdm.tqdm(files):
    prev = None
    # `with` closes each file; explicit UTF-8 avoids locale-dependent decoding.
    with open(fname, encoding='utf-8') as fh:
        for line in fh:
            fields = line.split('\t', 2)
            if len(fields) != 3:
                # Malformed line (fewer than three fields): skip it instead of
                # aborting the whole run with a ValueError.
                continue
            page, _, text = fields
            if page == prev:
                continue
            prev = page
            # Drop parenthesised asides before looking for the first sentence.
            text = brackets.sub('', text)
            m = phrase.match(text)
            if m is None:
                continue
            e = index.get(page)
            if e in entities:
                descriptions.append((e, m.group(0)))

# Report how many descriptions were collected, then store them as
# `<entity id><TAB><description>` lines. No quoting or escaping is applied.
# The encoding is fixed to UTF-8 so the file does not depend on the platform
# locale and matches the UTF-8 default of pandas.read_csv.
print(len(descriptions), 'descriptions')
with open('wiki/nlwiki-20220301/experiments/clean-q1-descriptions.tsv', 'w', encoding='utf-8') as fw:
    for e, d in descriptions:
        print(e, d, sep='\t', file=fw)
100%|██████████| 1000/1000 [00:11<00:00, 86.14it/s]
184022 descriptions
[36]:
import duckdb

import pandas as pd
import json
import pathlib
import tqdm

try:
    import dawg
except ImportError:
    import dawg_python as dawg

# Load the IntDAWG index from the dump directory. It is used below through
# index.get to translate the `page` column into integer ids.
root = pathlib.Path('wiki/nlwiki-20220301')
dawgfile = root.joinpath('index_nlwiki-20220301.dawg')
index = dawg.IntDAWG()
index.load(str(dawgfile))

def loads(x):
    """Parse ``x`` as JSON, falling back to an empty dict when that fails.

    Args:
        x: The value to decode. Anything ``json.loads`` cannot handle
            (malformed JSON text, ``None``, a float such as NaN, ...) is
            treated as "no data".

    Returns:
        The decoded JSON value, or ``{}`` if ``x`` could not be decoded.
    """
    try:
        return json.loads(x)
    # Only decoding failures are swallowed: TypeError for non-string input,
    # ValueError (incl. json.JSONDecodeError) for bad text, RecursionError
    # for pathologically nested documents. A bare `except:` would also hide
    # KeyboardInterrupt/SystemExit and make the loop impossible to interrupt.
    except (TypeError, ValueError, RecursionError):
        return {}

# Build two frames from the first 50 paragraph files:
#   texts - one row per paragraph (page id, text)
#   links - one row per (name, target) pair from the paragraph's links JSON
# Both end up with an `index` column holding the paragraph's global row
# number, which is what later queries join on.
# NOTE: `root` is rebound here to the paragraph directory (it pointed at the
# dump directory above); the name is kept for code that may rely on it.
root = pathlib.Path('wiki/nlwiki-20220301/nlwiki-20220301-paragraph-links/')
n = 0
# Separate names for the per-file pieces so `texts`/`links` are only ever
# DataFrames.
text_frames, link_frames = [], []
# Sorted before slicing: glob() order is arbitrary, so an unsorted `[:50]`
# could select a different subset of files on every machine or run.
for fname in tqdm.tqdm(sorted(root.glob('*'))[:50]):
    df = pd.read_csv(fname, sep='\t', on_bad_lines='skip', names=['page', 'links', 'text'])
    # Page value -> integer id via the index; nullable Int64 keeps misses as <NA>.
    df['page'] = df['page'].map(index.get).astype('Int64')
    # Shift this file's row numbers past everything read so far, making the
    # index unique across files.
    df.index += n
    n += len(df)
    # One row per link; explode() keeps the paragraph's index on every pair.
    pairs = df['links'].map(lambda x: list(loads(x).items())).explode()
    link_frames.append(pd.DataFrame({'name': pairs.str[0], 'target': pairs.str[1]}))
    # New frame without the raw JSON column (no in-place mutation).
    text_frames.append(df.drop(columns=['links']))
# reset_index() turns the global row number into the `index` column.
texts = pd.concat(text_frames).reset_index()
links = pd.concat(link_frames).reset_index()

100%|██████████| 50/50 [00:03<00:00, 14.29it/s]
[118]:
# Ten most frequent link targets among links whose name is exactly 'Utrecht'.
# `texts` and `links` are not registered explicitly; presumably DuckDB
# resolves them to the DataFrames of the same name in the notebook namespace.
utrecht_targets_sql = """
select target, count(*) c from texts, links
where texts.index=links.index and name == 'Utrecht'
group by target
order by c desc
limit 10
"""
duckdb.query(utrecht_targets_sql).df()
[118]:
target c
0 803 900
1 776 174
2 707767 24
3 261716 16
4 2677914 5
5 221653 5
6 575655 4
7 2679365 4
8 347488 3
9 85308316 2
[122]:
import csv

import numpy as np
import stopwordsiso as stopwords

# Entity id -> description, read from the two-column TSV produced by the
# description-extraction cell. That cell writes the lines with
# print(..., sep='\t'), i.e. without any CSV quoting, so quote characters are
# read literally here. With the default quoting a description that starts
# with '"' would lose its quotes or swallow the following lines.
descfile = 'wiki/nlwiki-20220301/experiments/clean-q1-descriptions.tsv'
e_desc = pd.read_csv(
    descfile,
    sep='\t',
    header=None,
    index_col=0,
    on_bad_lines='skip',
    quoting=csv.QUOTE_NONE,
)[1].to_dict()

# Entities to compare; alternative example sets are kept for reference.
# ents = [11775750, 2595790] # Openbaar Ministerie
# ents = [29520, 148] # China
ents = [803, 776, 707767] # Utrecht

# Every distinct (paragraph text, target) pair in which one of `ents` is the
# link target. The Python list repr is interpolated directly into the SQL;
# `ents` is a local list of ints, not external input.
sample = duckdb.query(
f"""
select text, target from texts, links
where texts.index=links.index and target in {str(ents)}
""").df().drop_duplicates()

from sklearn.feature_extraction.text import TfidfVectorizer
# Dutch stop words, handed to the vectorizer as a plain list.
stop_words = [*stopwords.stopwords('nl')]

# TF-IDF over uni-, bi- and trigrams of the sampled paragraphs. Terms must
# occur in at least 10 documents and in at most 75% of them.
tfidf = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=10,
    max_df=.75,
    stop_words=stop_words,
)
X = tfidf.fit_transform(sample['text'])

# Feature name -> fitted IDF weight.
feat_weight = {
    term: idf
    for term, idf in zip(tfidf.get_feature_names_out(), tfidf.idf_)
}

# For every entity: print its description followed by the ten features with
# the highest score log1p(#paragraphs containing the feature) * idf.
for e in ents:
    # .get(): an entity without a stored description must not abort the
    # report with a KeyError.
    print(e, e_desc.get(e, '<no description>'))
    # `sample` keeps a non-contiguous index after drop_duplicates(), while the
    # rows of X are positional. A plain NumPy boolean mask makes the row
    # selection explicitly positional.
    mask = (sample['target'] == e).to_numpy()
    # Features present in each selected paragraph (one array per paragraph).
    wdocs = tfidf.inverse_transform(X[mask])
    # Number of selected paragraphs in which each feature occurs.
    wcount = pd.Series(wdocs).explode().value_counts()
    top_feats = pd.Series({
        w: np.log1p(c) * feat_weight[w]
        for w, c in wcount.items()
    }).sort_values()[::-1][:10]
    print(top_feats)
    print()
803 Utrecht is een stad en gemeente in Nederland en de hoofdstad van de provincie Utrecht.
werk                16.226999
den haag            16.168381
utrecht stad        16.106695
haag                16.104145
rotterdam           16.102599
universiteit        16.101863
hoogleraar          16.091765
stad utrecht        16.040620
museum              16.017313
nederlandse stad    15.991953
dtype: float64

776 Utrecht is met een landoppervlakte van 1.485 km² de op een na kleinste provincie van Nederland.
provincie utrecht                15.735772
nederlandse provincie            15.461026
nederlandse provincie utrecht    15.311498
eemnes                           15.273796
provincie                        15.219166
eemnes provincie                 15.066386
eemnes provincie utrecht         15.066386
zuid holland                     14.828159
noord holland                    14.789730
holland utrecht                  14.663517
dtype: float64

707767 Het Sticht Utrecht was het territorium waarover de bisschoppen van Utrecht in de middeleeuwen als vorst de landsheerlijkheid uitoefenden.
sticht utrecht      14.500452
bisschop            14.426491
sticht              14.397942
bisschop utrecht    14.143663
luik                12.177787
ii                  12.118064
graafschap          11.782087
iv                  11.620562
iii                 11.343168
hendrik             11.151790
dtype: float64

[ ]: