Identify Entity Names

[1]:
import pandas as pd, json, numpy as np

# Input files for this run (simplewiki-20211120 counts + Mewsli-9 "en" evaluation set).
wcfile = "../data/wiki/simplewiki-20211120/wordcount.min10.json"
countfile = "../data/wiki/simplewiki-20211120/count.min10.json"
evalfile = "evaluation/Mewsli-9/en.tsv"

# Alternative configuration (nlwiki-20220301 + Mewsli-9 "nl"); uncomment to switch.
# wcfile = 'wiki/nlwiki-20220301/wordcount.min2.json'
# countfile = 'wiki/nlwiki-20220301/count.min2.json'
# evalfile = 'evaluation/Mewsli-9/nl.tsv'


# Load word counts: a Series indexed by name, sorted ascending by count.
wc = pd.read_json(wcfile, orient="index")[0].sort_values()

# Load anchor-entity counts. The JSON maps anchor text -> {entity id -> count};
# it is flattened into one (anchor, entity, count) row per pair.
# The context manager closes the file handle instead of leaking it.
with open(countfile) as count_fh:
    aec = json.load(count_fh)
aec = pd.DataFrame([(a, e, c) for a, ec in aec.items() for e, c in ec.items()])
# Strip the literal "Q" from entity ids so they can be stored as integers.
# regex=False makes the literal replacement explicit across pandas versions.
aec[1] = aec[1].str.replace("Q", "", regex=False).astype(int)

# Load bad entities.
# NOTE(review): `badent` is currently unused because the filter below is
# commented out -- confirm whether it should be applied.
badent = pd.read_csv("data/wikidata-20211122-disambig+list.txt", header=None)[0]
# aec = aec[~aec[1].isin(badent)]

# Final shape: Series of counts indexed by (anchor, entity id).
aec = aec.set_index([0, 1])[2]
print(len(wc), len(aec))


import pandas as pd, json, numpy as np
from minimel import normalize

def _normalized_surfaces(cell):
    """Return all normalized variants of every surface form in one eval cell.

    `cell` is the JSON text from column 1 of the evaluation TSV; iterating the
    decoded object yields the surface forms, and each one is expanded through
    `normalize`.
    """
    names = []
    for surf in json.loads(cell):
        names.extend(normalize(surf))
    return names


# One row per normalized surface form that is linked in the evaluation set.
ev = pd.read_csv(evalfile, header=None, sep="\t")[1]
ev = ev.map(_normalized_surfaces).explode()

# Per-name table: plain-text occurrences next to the total number of times the
# name is used as a link anchor (summed over all entities). Names missing from
# either source are dropped.
link_totals = aec.groupby(level=0).sum()
df = pd.DataFrame(
    {"word_count": wc, "link_count": link_totals}, dtype="Int64"
).dropna()
38456 42330
[2]:
# ratio = log(word_count) / log(link_count). Later cells drop names whose
# ratio lies above the chosen cutoff.
log_words = np.log(df["word_count"])
log_links = np.log(df["link_count"])
df["ratio"] = log_words / log_links
[3]:
import itertools

import tqdm
from minimel.mentions import count_name_lines

# Only the first 250 lines of the evaluation file are counted (fewer if the
# file is shorter). Materialising them as a list keeps a known length for the
# progress bar, and the context manager closes the file handle.
with open(evalfile) as eval_fh:
    lines = list(itertools.islice(eval_fh, 250))

# presumably count_name_lines yields (name, count) pairs -- the result is fed to
# dict() here and converted to an integer Series later; verify against minimel.
eval_count = dict(
    count_name_lines(tqdm.tqdm(lines, desc="Counting eval words"), countfile)
)
Counting eval words: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 250/250 [00:00<00:00, 1307.94it/s]
[4]:
"""
We want a trade-off between accepting bad entities and rejecting good ones.
"""
# Per-name table restricted to names that occur in the evaluation text,
# ordered by ascending ratio so cumulative sums sweep the threshold upwards.
df2 = df.copy()
df2["eval_count"] = pd.Series(eval_count).astype("Int64")
df2["eval_linked"] = df2.index.isin(ev)  # True if the name is a gold mention
df2 = df2.dropna().sort_values("ratio")

# Scatter of eval occurrences vs. ratio: green = linked in eval, red = not.
display(
    df2.plot.scatter(
        x="ratio",
        y="eval_count",
        c=df2["eval_linked"].map(lambda x: "g" if x else "r"),
        logy=True,
    )
)

# Eval occurrences split by whether the name is linked in the eval set.
linked_count = df2["eval_count"] * df2["eval_linked"]
unlinked_count = df2["eval_count"] * ~df2["eval_linked"]

# Row i describes the threshold "keep every name with ratio <= ratio_i".
true_pos = linked_count.cumsum()
s = pd.DataFrame(
    {
        "false positives": unlinked_count.cumsum(),
        # Linked occurrences strictly above the threshold. The previous
        # inclusive reverse-cumsum counted row i as both a true positive and a
        # false negative, which understated recall.
        "false negatives": linked_count.sum() - true_pos,
        "true positives": true_pos,
        "ratio": df2["ratio"],
    }
)
s["precision"] = s["true positives"] / (s["true positives"] + s["false positives"])
s["recall"] = s["true positives"] / (s["true positives"] + s["false negatives"])
s["f1"] = 2 * (s["precision"] * s["recall"]) / (s["precision"] + s["recall"])
s.set_index("ratio")[["precision", "recall", "f1"]].plot.line()
<Axes: xlabel='ratio', ylabel='eval_count'>
[4]:
<Axes: xlabel='ratio'>
../_images/notebooks_identify_names_4_2.png
../_images/notebooks_identify_names_4_3.png
[7]:
from IPython.display import HTML


def display_row(captioned_dataframes):
    """Render several DataFrames side by side in one flex row.

    Args:
        captioned_dataframes: mapping of caption -> DataFrame; each frame is
            rendered through its Styler with the caption set.

    Returns:
        An ``IPython.display.HTML`` object wrapping the rendered tables.
    """
    rendered = [
        frame.style.set_caption(caption)._repr_html_()
        for caption, frame in captioned_dataframes.items()
    ]
    joined = "&nbsp;&nbsp;".join(rendered)
    return HTML(
        f"""<div style="display:flex; flex-direction: row;">
        {joined}
    </div>"""
    )


# F1 as a function of the ratio threshold; undefined values count as 0.
f1 = s.set_index("ratio")["f1"].astype("float").fillna(0)

# Take the lowest ratio whose F1 is within 0.05 of the best F1.
near_best = (f1 - f1.max()).abs() < 0.05
cutoff = f1[near_best].index[0]
print("Threshold:", cutoff)

# Names strictly below the cutoff are kept, names strictly above are dropped.
kept = df[df.ratio < cutoff]
dropped = df[df.ratio > cutoff]
print("keeping", len(kept), "; dropping", len(dropped))

# Evaluation mentions that the cutoff would throw away.
drop_ev = set(ev) & set(dropped.index)
print("dropping eval names:", len(drop_ev))
print(drop_ev)

# Most frequent errors on either side of the cutoff, 20 rows each.
false_positives = df2[~df2["eval_linked"] & (df2["ratio"] < cutoff)]
false_negatives = df2[df2["eval_linked"] & (df2["ratio"] > cutoff)]
display_row(
    {
        "False Positives": false_positives.sort_values("eval_count")[::-1].head(20),
        "False Negatives": false_negatives.sort_values("eval_count")[::-1].head(20),
    }
)
Threshold: 1.951297634655119
keeping 36813 ; dropping 1642
dropping eval names: 261
{'2 december', 'patch', 'iraqi', 'target', 'maria', '26 april', 'first', 'judgment', 'dogs', 'tate', 'paralympics', 'reagan', 'kilometers', 'east coast', 'politically', 'reboot', 'unity', 'decatur', 'fusion', 'practice', 'association', 'teachers', 'english-language', 'south american', 'shadow', 'berlin, germany', 'de', 'wheelchair basketball', 'it', 'laurel', 'landmarks', 'the age', 'friendly', 'james', 'reduced', '25 june', 'out', 'civilians', 'jack', 'crustaceans', 'parents', 'prime', 'nelson', 'freestyle', 'in', 'awake', 'elementary', 'ace', 'buses', 'clinton', 'ron', 'red river', 'clouds', 'beta', 'communications', 'inches', 'kilometres', 'scarecrow', 'registered', 'guardian', 'values', 'dudley', 'pigs', 'licensing', 'shock', 's', 'cheyenne', 'offensive', 'bo', 'respect', 'minor', 'apples', 'the moon', 'golfer', 'irving', 'daniel', 'john', 'kenyan', 'uprising', 'lynn', 'heinz', 'left', 'scientists', 'metres', '25 september', 'run', 'flash', 'draft', 'meeting', 'bolivian', 'the game', 'saint john', 'agent', 'swift', 'astronomers', 'patrick', 'present', 'resolution', 'syrian', 'hope', 'focus', 'trophy', 'tornadoes', 'neutrality', 'png', 'europeans', 'concord', 'ministers', 'father', 'sharks', 'sri lankan', 'drivers', 'best actress', 'public service', 'northwest', 'north korean', 'wells', 'kilograms', 'this', 'commission', 'lost', 'core', 'lp', 'the left', 'acres', 'jackson', 'angels', 'apes', 'corona', 'saints', 'augusta', 'masters', 'convention', 'medium', 'w', 'unions', 'value', 'evening', 'eight', 'duty', 'refugees', 'mark', 'who', 'notre dame', 'believe', 'wayne', 'british house of commons', 'converted', 'lifetime', 'staff', 'thomas', 'leadership', 'manor', 'triple crown', 'may', 'nico', 'farmers', 'clyde', 'taxes', 'fairy tales', 'ted', 'bishops', 'freud', '30 november', 'create', 'lighting', 'svg', 'novels', '21 march', 'tortured', 'today', 'colombian', 'x', 'moe', 'led', 'malaysian', 'partnership', 'jefferson', 'tie', 'trained', 'seven', 'capitals', 
'southern', 'rabbits', 'short', 'museums', 'arrested', 'physicists', 'ministry', 'editorial', 'runner', 'mg', 'presidential election', 'bullets', 'criticized', 'retailer', 'sale', 'protests', 'psychologists', 'general assembly', 'b.c.', 'trailer', 'presidency', 'straight', 'edison', 'crash', 'divisions', 'the edge', 'roosevelt', 'stanford', 'al', 'tna', 'oceans', 'dolphins', 'hearts', 'show', 'algerian', 'path', 'constitutional', 'vista', 'council', 'loop', 'manuscripts', 'miles', 'opposition', 'riding', 'civil', 'runs', 'anc', 'wife', 'blacks', 'spiders', 'helicopters', 'bangladeshi', 'elections', 'evolutionary', 'run out', 'cornell', 'the white house', 'paul', 'engagement', 'false', 'style', 'elizabeth', 'complex', 'id', 'outbreak', 'speak', 'destiny', 'again', 'peter', 'north american', 'firefighters', 'frederick', 'lying', 'if', 'study', 'moderate', 'holly', 'websites', 'dollars'}
[7]:
False Positives
  word_count link_count ratio eval_count eval_linked
year 22166 454 1.635527 140 False
can 50542 337 1.860894 111 False
state 19243 4312 1.178721 80 False
day 14248 325 1.653641 64 False
party 7020 142 1.787092 64 False
u.s. 4676 2064 1.107148 63 False
world 19029 524 1.573702 63 False
group 17128 269 1.742444 59 False
second 15927 190 1.844049 51 False
country 12390 4411 1.123070 49 False
million 9756 615 1.430423 46 False
fire 4523 570 1.326412 44 False
public 5622 220 1.600860 43 False
interview 957 116 1.443920 43 False
man 9036 169 1.775663 42 False
uk 5383 947 1.253557 42 False
week 2683 75 1.828538 39 False
question 1430 62 1.760405 36 False
set 10241 176 1.785936 35 False
information 5795 329 1.494939 34 False
  
False Negatives
  word_count link_count ratio eval_count eval_linked
in 882984 193 2.601533 1855 True
s 144874 364 2.015145 692 True
it 237945 43 3.291448 495 True
this 95628 158 2.265285 383 True
who 44973 38 2.945310 223 True
first 60542 13 4.292908 140 True
if 27312 25 3.173493 119 True
out 19446 63 2.383557 116 True
today 6340 36 2.443027 74 True
may 20704 132 2.035324 51 True
al 4833 26 2.603736 44 True
left 11344 75 2.162471 31 True
again 7952 17 3.169962 29 True
show 11522 70 2.201253 26 True
run 5137 13 3.331147 26 True
believe 3406 14 3.081894 24 True
elections 1471 31 2.123973 23 True
arrested 1569 13 2.868748 23 True
meeting 1389 27 2.195600 22 True
john 12391 63 2.274781 20 True
[7]:
from minimel.normalize import normalize
from minimel.vectorize import vw_tok

import json, math, tqdm, dawg

# Reload the raw anchor -> {entity id -> count} mapping; the context manager
# closes the file handle instead of leaking it.
with open(countfile) as count_fh:
    surface_weights = json.load(count_fh)

# filter out selection: token sequences of all names above the ratio cutoff
drop_tok = {tuple(vw_tok(d)) for d in df[df.ratio > cutoff].index}
surface_weights = {
    k: v
    for k, v in surface_weights.items()
    # endswith() also copes with an empty key, where k[-1] would raise IndexError
    if (tuple(vw_tok(k)) not in drop_tok) and not k.endswith("-")
}

# Prefix-searchable index over the surviving surface forms.
surface_trie = dawg.CompletionDAWG(surface_weights)
import pathlib

# Saved next to the count file as "<count file stem>.salient.completiondawg".
count_path = pathlib.Path(countfile)
surface_trie.save(count_path.parent / (count_path.stem + ".salient.completiondawg"))

Analyse Mention Detection

[15]:
# Inspect a single surface form: its row in the per-name table and, if it is a
# known anchor, its entity counts plus whether the cutoff drops it from eval.
l = "stille oceaan"
# Guard the lookup: df.loc raises KeyError for names that are absent from df,
# which would abort a full notebook run.
if l in df.index:
    display(df.loc[l])
else:
    print(f"{l!r} not found in df")
if l in aec:
    display(aec[l], l in drop_ev)
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File ~/.conda/envs/p38/lib/python3.8/site-packages/pandas/core/indexes/base.py:3652, in Index.get_loc(self, key)
   3651 try:
-> 3652     return self._engine.get_loc(casted_key)
   3653 except KeyError as err:

File ~/.conda/envs/p38/lib/python3.8/site-packages/pandas/_libs/index.pyx:147, in pandas._libs.index.IndexEngine.get_loc()

File ~/.conda/envs/p38/lib/python3.8/site-packages/pandas/_libs/index.pyx:176, in pandas._libs.index.IndexEngine.get_loc()

File pandas/_libs/hashtable_class_helper.pxi:7080, in pandas._libs.hashtable.PyObjectHashTable.get_item()

File pandas/_libs/hashtable_class_helper.pxi:7088, in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'stille oceaan'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[15], line 2
      1 l = "stille oceaan"
----> 2 display(df.loc[l])
      3 if l in aec:
      4     display(aec[l], l in drop_ev)

File ~/.conda/envs/p38/lib/python3.8/site-packages/pandas/core/indexing.py:1103, in _LocationIndexer.__getitem__(self, key)
   1100 axis = self.axis or 0
   1102 maybe_callable = com.apply_if_callable(key, self.obj)
-> 1103 return self._getitem_axis(maybe_callable, axis=axis)

File ~/.conda/envs/p38/lib/python3.8/site-packages/pandas/core/indexing.py:1343, in _LocIndexer._getitem_axis(self, key, axis)
   1341 # fall thru to straight lookup
   1342 self._validate_key(key, axis)
-> 1343 return self._get_label(key, axis=axis)

File ~/.conda/envs/p38/lib/python3.8/site-packages/pandas/core/indexing.py:1293, in _LocIndexer._get_label(self, label, axis)
   1291 def _get_label(self, label, axis: AxisInt):
   1292     # GH#5567 this will fail if the label is not present in the axis.
-> 1293     return self.obj.xs(label, axis=axis)

File ~/.conda/envs/p38/lib/python3.8/site-packages/pandas/core/generic.py:4095, in NDFrame.xs(self, key, axis, level, drop_level)
   4093             new_index = index[loc]
   4094 else:
-> 4095     loc = index.get_loc(key)
   4097     if isinstance(loc, np.ndarray):
   4098         if loc.dtype == np.bool_:

File ~/.conda/envs/p38/lib/python3.8/site-packages/pandas/core/indexes/base.py:3654, in Index.get_loc(self, key)
   3652     return self._engine.get_loc(casted_key)
   3653 except KeyError as err:
-> 3654     raise KeyError(key) from err
   3655 except TypeError:
   3656     # If we have a listlike key, _check_indexing_error will raise
   3657     #  InvalidIndexError. Otherwise we fall through and re-raise
   3658     #  the TypeError.
   3659     self._check_indexing_error(key)

KeyError: 'stille oceaan'
[10]:
# Entity-id -> count mapping stored for this surface form; the key is still
# present, i.e. it survived the cutoff filter applied to surface_weights.
surface_weights["stille oceaan"]
[10]:
{'Q184425': 2, 'Q98': 1274}
[77]:
def get_matches(surface_trie, text, language=None):
    """Yield every trie key whose token sequence occurs in `text`.

    Each normalized variant of `text` is tokenized; at every token position the
    trie is queried for keys starting with that token, and a key is yielded
    when its own tokens equal the tokens of the text at that position.
    A key is yielded once per occurrence, so repeated mentions give repeats.

    Args:
        surface_trie: object exposing ``keys(prefix)`` (e.g. a CompletionDAWG).
        text: raw input text.
        language: forwarded to ``normalize``.
    """
    for variant in normalize(text, language=language):
        tokens = vw_tok(variant)
        for start, first_token in enumerate(tokens):
            candidates = surface_trie.keys(first_token)
            for candidate in candidates:
                candidate_tokens = vw_tok(candidate)
                stop = start + len(candidate_tokens)
                window = tokens[start:stop]
                if window == candidate_tokens:
                    yield candidate


import pandas as pd

# Read the first 10 rows of the evaluation TSV and decode the JSON in column 1.
# NOTE(review): the path is hard-coded to the "nl" file rather than using the
# `evalfile` configured at the top of the notebook -- confirm this is intended.
data = pd.read_csv("evaluation/Mewsli-9/nl.tsv", sep="\t", nrows=10, header=None)
data[1] = data[1].map(json.loads)
# Pick one example row (position 6): first column ignored, then the decoded
# mention mapping, then the document text.
_, mention_ent, text = data.iloc[6]
print(text)
print(mention_ent)


# Print every surface form detected in the text with its candidate entity counts.
for match in get_matches(surface_trie, text):
    # NIL match
    weights = surface_weights[match]
    print(match, weights)
    # Disabled draft of label generation (kept for reference):
    # for label in vw_label_lines(weights, -1, comp, ent_feats):
    #     labels.append(label)
Zware aardbeving voor de kust in zuiden Mexico  8 september 2017   Zware aardbeving 8.2 voor de kust in zuiden Mexico  De zuidkust van Mexico is omstreeks middernacht plaatselijke tijd getroffen door een aardbeving. De Mexicaanse seismologische dienst zegt dat de beving een kracht van 8.2 op de schaal van Richter had. Het centrum was in de Stille Oceaan, op 70 kilometer diepte, zo'n 100 kilometer ten zuidwesten van Pijijiapan. De meeste doden vielen in Oaxaca en het aangrenzende Chiapas. Ook in buurland Guatemala heeft de beving een dodelijk slachtoffer geëist.  De aardbeving was ook te voelen in Mexico-Stad, waar mensen in paniek de straat op renden. De aardbeving zorgde even voor een grootschalige stroomuitval bij 1 miljoen mensen, die snel weer kon worden verholpen. De beving veroorzaakte ook een lichte tsunami.  Een dergelijke krachtige aardbeving komt gemiddeld één keer per jaar voor. De vorige van dit kaliber was in 2015, in Chili. De laatste zware aardbeving in Mexico (waarbij vele duizenden doden vielen) was in 1985. Deze nieuwe beving is echter volgens president Nieto de krachtigste in Mexico in 100 jaar.  Mexicaans president Enrique Peña Nieto heeft daags nadien drie dagen van nationale rouw afgekondigd.  Het officiële dodental staat nu op 65. Vooral in Juchitan is veel schade aan woningen.  Bronnen.  Zelf schrijven? Hoe schrijf ik een artikel?
{'Oaxaca': 34110, 'Chiapas': 60123, 'Guatemala': 774, 'Mexico-Stad': 1489, 'in 1985': 1798567}
zware aardbeving {'Q214866': 2, 'Q43777': 15, 'Q151835': 2, 'Q211386': 3, 'Q1798567': 2}
aardbeving {'Q214866': 3, 'Q1903316': 3, 'Q749610': 3, 'Q1053476': 2, 'Q116688': 8, 'Q16068825': 2, 'Q191055': 5, 'Q328663': 7, 'Q210137': 2, 'Q2441400': 7, 'Q43777': 9, 'Q1909872': 2, 'Q19830062': 5, 'Q2885880': 2, 'Q3245204': 3, 'Q274498': 2, 'Q27850765': 2, 'Q2483366': 2, 'Q3161000': 2, 'Q38821': 3, 'Q939106': 6, 'Q1082836': 3, 'Q130754': 2, 'Q1847006': 2, 'Q1348954': 3, 'Q26689701': 3, 'Q745411': 3, 'Q151835': 4, 'Q1990627': 2, 'Q1348910': 5, 'Q211386': 7, 'Q7944': 1417, 'Q867691': 5, 'Q152033': 4, 'Q47144124': 2, 'Q207918': 5, 'Q1798567': 3}
mexico {'Q588480': 2, 'Q260965': 6, 'Q15296784': 18, 'Q599923': 19, 'Q6797304': 4, 'Q285658': 4, 'Q170603': 3, 'Q55712': 11, 'Q641583': 3, 'Q50647': 3, 'Q96': 10551, 'Q2151952': 12, 'Q941143': 2, 'Q8109': 2, 'Q48311380': 2, 'Q108810124': 4, 'Q51636908': 3, 'Q1489': 105, 'Q478399': 6, 'Q82112': 263, 'Q371334': 16, 'Q2294975': 9, 'Q764690': 8, 'Q2776455': 6, 'Q29066592': 3, 'Q603181': 3, 'Q173099': 2, 'Q1182352': 11, 'Q1166178': 3, 'Q1926260': 10, 'Q17041307': 5, 'Q150644': 2, 'Q2342098': 7, 'Q2712776': 3, 'Q12123144': 3, 'Q22696198': 2, 'Q164089': 908, 'Q1160682': 22, 'Q1907023': 4, 'Q3098723': 4, 'Q279769': 2, 'Q1041675': 2, 'Q387588': 3, 'Q19826175': 13, 'Q178652': 12, 'Q2376913': 2, 'Q43196466': 9, 'Q1414354': 3, 'Q8429': 7, 'Q979079': 6, 'Q1151279': 3}
8 september {'Q2850': 1588}
8½ {'Q12018': 22}
zware aardbeving {'Q214866': 2, 'Q43777': 15, 'Q151835': 2, 'Q211386': 3, 'Q1798567': 2}
aardbeving {'Q214866': 3, 'Q1903316': 3, 'Q749610': 3, 'Q1053476': 2, 'Q116688': 8, 'Q16068825': 2, 'Q191055': 5, 'Q328663': 7, 'Q210137': 2, 'Q2441400': 7, 'Q43777': 9, 'Q1909872': 2, 'Q19830062': 5, 'Q2885880': 2, 'Q3245204': 3, 'Q274498': 2, 'Q27850765': 2, 'Q2483366': 2, 'Q3161000': 2, 'Q38821': 3, 'Q939106': 6, 'Q1082836': 3, 'Q130754': 2, 'Q1847006': 2, 'Q1348954': 3, 'Q26689701': 3, 'Q745411': 3, 'Q151835': 4, 'Q1990627': 2, 'Q1348910': 5, 'Q211386': 7, 'Q7944': 1417, 'Q867691': 5, 'Q152033': 4, 'Q47144124': 2, 'Q207918': 5, 'Q1798567': 3}
8½ {'Q12018': 22}
mexico {'Q588480': 2, 'Q260965': 6, 'Q15296784': 18, 'Q599923': 19, 'Q6797304': 4, 'Q285658': 4, 'Q170603': 3, 'Q55712': 11, 'Q641583': 3, 'Q50647': 3, 'Q96': 10551, 'Q2151952': 12, 'Q941143': 2, 'Q8109': 2, 'Q48311380': 2, 'Q108810124': 4, 'Q51636908': 3, 'Q1489': 105, 'Q478399': 6, 'Q82112': 263, 'Q371334': 16, 'Q2294975': 9, 'Q764690': 8, 'Q2776455': 6, 'Q29066592': 3, 'Q603181': 3, 'Q173099': 2, 'Q1182352': 11, 'Q1166178': 3, 'Q1926260': 10, 'Q17041307': 5, 'Q150644': 2, 'Q2342098': 7, 'Q2712776': 3, 'Q12123144': 3, 'Q22696198': 2, 'Q164089': 908, 'Q1160682': 22, 'Q1907023': 4, 'Q3098723': 4, 'Q279769': 2, 'Q1041675': 2, 'Q387588': 3, 'Q19826175': 13, 'Q178652': 12, 'Q2376913': 2, 'Q43196466': 9, 'Q1414354': 3, 'Q8429': 7, 'Q979079': 6, 'Q1151279': 3}
mexico {'Q588480': 2, 'Q260965': 6, 'Q15296784': 18, 'Q599923': 19, 'Q6797304': 4, 'Q285658': 4, 'Q170603': 3, 'Q55712': 11, 'Q641583': 3, 'Q50647': 3, 'Q96': 10551, 'Q2151952': 12, 'Q941143': 2, 'Q8109': 2, 'Q48311380': 2, 'Q108810124': 4, 'Q51636908': 3, 'Q1489': 105, 'Q478399': 6, 'Q82112': 263, 'Q371334': 16, 'Q2294975': 9, 'Q764690': 8, 'Q2776455': 6, 'Q29066592': 3, 'Q603181': 3, 'Q173099': 2, 'Q1182352': 11, 'Q1166178': 3, 'Q1926260': 10, 'Q17041307': 5, 'Q150644': 2, 'Q2342098': 7, 'Q2712776': 3, 'Q12123144': 3, 'Q22696198': 2, 'Q164089': 908, 'Q1160682': 22, 'Q1907023': 4, 'Q3098723': 4, 'Q279769': 2, 'Q1041675': 2, 'Q387588': 3, 'Q19826175': 13, 'Q178652': 12, 'Q2376913': 2, 'Q43196466': 9, 'Q1414354': 3, 'Q8429': 7, 'Q979079': 6, 'Q1151279': 3}
middernacht {'Q36402': 104}
plaatselijke tijd {'Q6940': 2}
aardbeving {'Q214866': 3, 'Q1903316': 3, 'Q749610': 3, 'Q1053476': 2, 'Q116688': 8, 'Q16068825': 2, 'Q191055': 5, 'Q328663': 7, 'Q210137': 2, 'Q2441400': 7, 'Q43777': 9, 'Q1909872': 2, 'Q19830062': 5, 'Q2885880': 2, 'Q3245204': 3, 'Q274498': 2, 'Q27850765': 2, 'Q2483366': 2, 'Q3161000': 2, 'Q38821': 3, 'Q939106': 6, 'Q1082836': 3, 'Q130754': 2, 'Q1847006': 2, 'Q1348954': 3, 'Q26689701': 3, 'Q745411': 3, 'Q151835': 4, 'Q1990627': 2, 'Q1348910': 5, 'Q211386': 7, 'Q7944': 1417, 'Q867691': 5, 'Q152033': 4, 'Q47144124': 2, 'Q207918': 5, 'Q1798567': 3}
mexicaanse {'Q96': 2011, 'Q581921': 2, 'Q764690': 2, 'Q207965': 4}
seismologische {'Q83371': 17}
beving {'Q1903316': 25}
8½ {'Q12018': 22}
de schaal van richter {'Q38768': 6}
schaal van richter {'Q38768': 719}
richter {'Q2777659': 54, 'Q81240': 4, 'Q2874233': 13, 'Q38768': 7, 'Q224831': 3}
stille oceaan {'Q184425': 2, 'Q98': 1274}
100 kilometer {'Q1847570': 5}
oaxaca {'Q13906835': 3, 'Q131429': 71, 'Q34110': 383, 'Q345582': 2}
chiapas {'Q12187096': 13, 'Q1426348': 16, 'Q60123': 394}
guatemala {'Q1536706': 5, 'Q844729': 3, 'Q695660': 33, 'Q3667955': 3, 'Q270168': 188, 'Q774': 2109}
beving {'Q1903316': 25}
slachtoffer {'Q1851760': 351, 'Q181600': 6}
aardbeving {'Q214866': 3, 'Q1903316': 3, 'Q749610': 3, 'Q1053476': 2, 'Q116688': 8, 'Q16068825': 2, 'Q191055': 5, 'Q328663': 7, 'Q210137': 2, 'Q2441400': 7, 'Q43777': 9, 'Q1909872': 2, 'Q19830062': 5, 'Q2885880': 2, 'Q3245204': 3, 'Q274498': 2, 'Q27850765': 2, 'Q2483366': 2, 'Q3161000': 2, 'Q38821': 3, 'Q939106': 6, 'Q1082836': 3, 'Q130754': 2, 'Q1847006': 2, 'Q1348954': 3, 'Q26689701': 3, 'Q745411': 3, 'Q151835': 4, 'Q1990627': 2, 'Q1348910': 5, 'Q211386': 7, 'Q7944': 1417, 'Q867691': 5, 'Q152033': 4, 'Q47144124': 2, 'Q207918': 5, 'Q1798567': 3}
voelen {'Q205555': 14, 'Q328835': 16}
mexico {'Q588480': 2, 'Q260965': 6, 'Q15296784': 18, 'Q599923': 19, 'Q6797304': 4, 'Q285658': 4, 'Q170603': 3, 'Q55712': 11, 'Q641583': 3, 'Q50647': 3, 'Q96': 10551, 'Q2151952': 12, 'Q941143': 2, 'Q8109': 2, 'Q48311380': 2, 'Q108810124': 4, 'Q51636908': 3, 'Q1489': 105, 'Q478399': 6, 'Q82112': 263, 'Q371334': 16, 'Q2294975': 9, 'Q764690': 8, 'Q2776455': 6, 'Q29066592': 3, 'Q603181': 3, 'Q173099': 2, 'Q1182352': 11, 'Q1166178': 3, 'Q1926260': 10, 'Q17041307': 5, 'Q150644': 2, 'Q2342098': 7, 'Q2712776': 3, 'Q12123144': 3, 'Q22696198': 2, 'Q164089': 908, 'Q1160682': 22, 'Q1907023': 4, 'Q3098723': 4, 'Q279769': 2, 'Q1041675': 2, 'Q387588': 3, 'Q19826175': 13, 'Q178652': 12, 'Q2376913': 2, 'Q43196466': 9, 'Q1414354': 3, 'Q8429': 7, 'Q979079': 6, 'Q1151279': 3}
mexico stad {'Q1489': 23}
mexico-stad {'Q1489': 2470, 'Q665894': 3, 'Q927513': 2}
paniek {'Q696490': 3, 'Q208450': 45, 'Q2788113': 5}
aardbeving {'Q214866': 3, 'Q1903316': 3, 'Q749610': 3, 'Q1053476': 2, 'Q116688': 8, 'Q16068825': 2, 'Q191055': 5, 'Q328663': 7, 'Q210137': 2, 'Q2441400': 7, 'Q43777': 9, 'Q1909872': 2, 'Q19830062': 5, 'Q2885880': 2, 'Q3245204': 3, 'Q274498': 2, 'Q27850765': 2, 'Q2483366': 2, 'Q3161000': 2, 'Q38821': 3, 'Q939106': 6, 'Q1082836': 3, 'Q130754': 2, 'Q1847006': 2, 'Q1348954': 3, 'Q26689701': 3, 'Q745411': 3, 'Q151835': 4, 'Q1990627': 2, 'Q1348910': 5, 'Q211386': 7, 'Q7944': 1417, 'Q867691': 5, 'Q152033': 4, 'Q47144124': 2, 'Q207918': 5, 'Q1798567': 3}
stroomuitval {'Q828827': 45}
beving {'Q1903316': 25}
tsunami {'Q8070': 465, 'Q36204': 6, 'Q2169285': 6, 'Q1211774': 4, 'Q130754': 22, 'Q15042655': 21}
aardbeving {'Q214866': 3, 'Q1903316': 3, 'Q749610': 3, 'Q1053476': 2, 'Q116688': 8, 'Q16068825': 2, 'Q191055': 5, 'Q328663': 7, 'Q210137': 2, 'Q2441400': 7, 'Q43777': 9, 'Q1909872': 2, 'Q19830062': 5, 'Q2885880': 2, 'Q3245204': 3, 'Q274498': 2, 'Q27850765': 2, 'Q2483366': 2, 'Q3161000': 2, 'Q38821': 3, 'Q939106': 6, 'Q1082836': 3, 'Q130754': 2, 'Q1847006': 2, 'Q1348954': 3, 'Q26689701': 3, 'Q745411': 3, 'Q151835': 4, 'Q1990627': 2, 'Q1348910': 5, 'Q211386': 7, 'Q7944': 1417, 'Q867691': 5, 'Q152033': 4, 'Q47144124': 2, 'Q207918': 5, 'Q1798567': 3}
kaliber {'Q170417': 393, 'Q3812588': 2, 'Q256690': 14}
chili {'Q147131': 2, 'Q863794': 2, 'Q683744': 2, 'Q165199': 3, 'Q1896262': 11, 'Q589856': 7, 'Q143856': 2, 'Q2342527': 2, 'Q172025': 757, 'Q2105105': 10, 'Q393170': 2, 'Q602656': 2, 'Q261405': 7, 'Q298': 4000, 'Q1072889': 2, 'Q755107': 2, 'Q15293622': 12, 'Q606832': 9}
zware aardbeving {'Q214866': 2, 'Q43777': 15, 'Q151835': 2, 'Q211386': 3, 'Q1798567': 2}
aardbeving {'Q214866': 3, 'Q1903316': 3, 'Q749610': 3, 'Q1053476': 2, 'Q116688': 8, 'Q16068825': 2, 'Q191055': 5, 'Q328663': 7, 'Q210137': 2, 'Q2441400': 7, 'Q43777': 9, 'Q1909872': 2, 'Q19830062': 5, 'Q2885880': 2, 'Q3245204': 3, 'Q274498': 2, 'Q27850765': 2, 'Q2483366': 2, 'Q3161000': 2, 'Q38821': 3, 'Q939106': 6, 'Q1082836': 3, 'Q130754': 2, 'Q1847006': 2, 'Q1348954': 3, 'Q26689701': 3, 'Q745411': 3, 'Q151835': 4, 'Q1990627': 2, 'Q1348910': 5, 'Q211386': 7, 'Q7944': 1417, 'Q867691': 5, 'Q152033': 4, 'Q47144124': 2, 'Q207918': 5, 'Q1798567': 3}
mexico {'Q588480': 2, 'Q260965': 6, 'Q15296784': 18, 'Q599923': 19, 'Q6797304': 4, 'Q285658': 4, 'Q170603': 3, 'Q55712': 11, 'Q641583': 3, 'Q50647': 3, 'Q96': 10551, 'Q2151952': 12, 'Q941143': 2, 'Q8109': 2, 'Q48311380': 2, 'Q108810124': 4, 'Q51636908': 3, 'Q1489': 105, 'Q478399': 6, 'Q82112': 263, 'Q371334': 16, 'Q2294975': 9, 'Q764690': 8, 'Q2776455': 6, 'Q29066592': 3, 'Q603181': 3, 'Q173099': 2, 'Q1182352': 11, 'Q1166178': 3, 'Q1926260': 10, 'Q17041307': 5, 'Q150644': 2, 'Q2342098': 7, 'Q2712776': 3, 'Q12123144': 3, 'Q22696198': 2, 'Q164089': 908, 'Q1160682': 22, 'Q1907023': 4, 'Q3098723': 4, 'Q279769': 2, 'Q1041675': 2, 'Q387588': 3, 'Q19826175': 13, 'Q178652': 12, 'Q2376913': 2, 'Q43196466': 9, 'Q1414354': 3, 'Q8429': 7, 'Q979079': 6, 'Q1151279': 3}
beving {'Q1903316': 25}
nieto {'Q251846': 86}
mexico {'Q588480': 2, 'Q260965': 6, 'Q15296784': 18, 'Q599923': 19, 'Q6797304': 4, 'Q285658': 4, 'Q170603': 3, 'Q55712': 11, 'Q641583': 3, 'Q50647': 3, 'Q96': 10551, 'Q2151952': 12, 'Q941143': 2, 'Q8109': 2, 'Q48311380': 2, 'Q108810124': 4, 'Q51636908': 3, 'Q1489': 105, 'Q478399': 6, 'Q82112': 263, 'Q371334': 16, 'Q2294975': 9, 'Q764690': 8, 'Q2776455': 6, 'Q29066592': 3, 'Q603181': 3, 'Q173099': 2, 'Q1182352': 11, 'Q1166178': 3, 'Q1926260': 10, 'Q17041307': 5, 'Q150644': 2, 'Q2342098': 7, 'Q2712776': 3, 'Q12123144': 3, 'Q22696198': 2, 'Q164089': 908, 'Q1160682': 22, 'Q1907023': 4, 'Q3098723': 4, 'Q279769': 2, 'Q1041675': 2, 'Q387588': 3, 'Q19826175': 13, 'Q178652': 12, 'Q2376913': 2, 'Q43196466': 9, 'Q1414354': 3, 'Q8429': 7, 'Q979079': 6, 'Q1151279': 3}
mexicaans {'Q82112': 2, 'Q96': 2266}
enrique {'Q552058': 12, 'Q47122': 2, 'Q1803019': 2, 'Q2290535': 2}
enrique peña {'Q296741': 5}
enrique peña nieto {'Q296741': 36}
peña {'Q3313041': 8, 'Q417545': 4, 'Q2497712': 6, 'Q6125576': 13, 'Q532103': 12}
nieto {'Q251846': 86}
dagen van nationale rouw {'Q1127742': 16}
nationale rouw {'Q13878715': 2, 'Q1127742': 18}
rouw {'Q2450206': 2, 'Q750652': 157}
afgekondigd {'Q446780': 2}
dodental {'Q1435473': 3}
[ ]:

[75]:
# Look up a single name in the evaluation-restricted table.
# NOTE(review): the recorded output lists columns wc/aec/ratio/ev_count/ev_linked,
# which do not match the word_count/link_count/eval_count/eval_linked columns that
# df2 is built with earlier in this notebook -- the output looks stale; re-run to confirm.
df2.loc["op de rails"]
[75]:
wc            1.0
aec          59.0
ratio         0.0
ev_count      2.0
ev_linked     0.0
Name: op de rails, dtype: Float64
[5]:
from minimel.vectorize import vw
import pathlib

# Smoke test of minimel's `vw` on one hand-written input line. The line is
# tab-separated: an id, a JSON object mapping a surface form to an entity
# number, and the text.
# presumably `fname` points to a cleaned count file used for candidate lookup -- TODO confirm
fname = "wiki/nlwiki-20220301/experiments/clean-q0.25.json"
vw(
    ['id\t{"amsterdam": 9899}\tik ging naar amsterdam en Rotterdam'],
    pathlib.Path(fname),
)
[5]:
['shared |s ik ging naar amsterdam en rotterdam',
 '9899:0 |l amsterdam=9899 ',
 '727:1 |l amsterdam=727 ',
 '478771:1 |l amsterdam=478771 ',
 '50719:1 |l amsterdam=50719 ',
 '2060132:1 |l amsterdam=2060132 ',
 '214341:1 |l amsterdam=214341 ',
 '194215:1 |l amsterdam=194215 ',
 '26674641:1 |l amsterdam=26674641 ',
 '1127380:1 |l amsterdam=1127380 ',
 '9694:1 |l amsterdam=9694 ',
 '2691069:1 |l amsterdam=2691069 ',
 '2393888:1 |l amsterdam=2393888 ',
 '478785:1 |l amsterdam=478785 ',
 '1397383:1 |l amsterdam=1397383 ',
 '122781:1 |l amsterdam=122781 ',
 '959016:1 |l amsterdam=959016 ',
 '5715626:1 |l amsterdam=5715626 ',
 '2060246:1 |l amsterdam=2060246 ',
 '4748823:1 |l amsterdam=4748823 ',
 '2049529:1 |l amsterdam=2049529 ',
 '95630476:1 |l amsterdam=95630476 ',
 '683829:1 |l amsterdam=683829 ',
 '478456:1 |l amsterdam=478456 ',
 '4549041:1 |l amsterdam=4549041 ',
 '13427674:1 |l amsterdam=13427674 ',
 '585429:1 |l amsterdam=585429 ',
 '505639:1 |l amsterdam=505639 ',
 '2276292:1 |l amsterdam=2276292 ',
 '-1:0 |l ik=-1 ',
 '49404:1 |l ik=49404 ',
 '2422744:1 |l ik=2422744 ',
 '1143089:1 |l ik=1143089 ',
 '-1:0 |l en=-1 ',
 '1860:1 |l en=1860 ',
 '1377447:1 |l en=1377447 ',
 '191081:1 |l en=191081 ',
 '933:1 |l en=933 ',
 '-1:0 |l rotterdam=-1 ',
 '34370:1 |l rotterdam=34370 ',
 '2680952:1 |l rotterdam=2680952 ',
 '201284:1 |l rotterdam=201284 ',
 '801388:1 |l rotterdam=801388 ',
 '1027807:1 |l rotterdam=1027807 ',
 '166127:1 |l rotterdam=166127 ',
 '1429091:1 |l rotterdam=1429091 ',
 '783945:1 |l rotterdam=783945 ',
 '2329167:1 |l rotterdam=2329167 ',
 '931364:1 |l rotterdam=931364 ',
 '1341108:1 |l rotterdam=1341108 ',
 '299701:1 |l rotterdam=299701 ',
 '2166239:1 |l rotterdam=2166239 ',
 '2856391:1 |l rotterdam=2856391 ',
 '656807:1 |l rotterdam=656807 ',
 '633529:1 |l rotterdam=633529 ',
 '849230:1 |l rotterdam=849230 ',
 '1774596:1 |l rotterdam=1774596 ',
 '2930113:1 |l rotterdam=2930113 ',
 '1866458:1 |l rotterdam=1866458 ',
 '',
 '']