Identify Entity Names
[1]:
import pandas as pd, json, numpy as np
from minimel import normalize

# Input files for the active configuration.
wcfile = "../data/wiki/simplewiki-20211120/wordcount.min10.json"
countfile = "../data/wiki/simplewiki-20211120/count.min10.json"
evalfile = "evaluation/Mewsli-9/en.tsv"
# Alternative configuration (swap in by uncommenting):
# wcfile = 'wiki/nlwiki-20220301/wordcount.min2.json'
# countfile = 'wiki/nlwiki-20220301/count.min2.json'
# evalfile = 'evaluation/Mewsli-9/nl.tsv'

# Load word counts (a Series sorted by ascending count).
wc = pd.read_json(wcfile, orient="index")[0].sort_values()

# Load anchor-entity counts. The JSON is a nested mapping
# {anchor: {entity: count}}; flatten it to one (anchor, entity, count) row each.
# A context manager is used so the file handle is closed deterministically.
with open(countfile) as count_fh:
    aec = json.load(count_fh)
aec = pd.DataFrame([(a, e, c) for a, ec in aec.items() for e, c in ec.items()])
# Entity ids are stored as "Q<number>"; keep the numeric part only.
# regex=False makes the literal replacement independent of the pandas version.
aec[1] = aec[1].str.replace("Q", "", regex=False).astype(int)

# Load bad entities.
# NOTE: the filter below is disabled, so `badent` is currently loaded but unused.
badent = pd.read_csv("data/wikidata-20211122-disambig+list.txt", header=None)[0]
# aec = aec[~aec[1].isin(badent)]
aec = aec.set_index([0, 1])[2]
print(len(wc), len(aec))

# Names from column 1 of the evaluation file: each cell is a JSON object whose
# keys are passed through `normalize`; the result is exploded to one name per row.
ev = pd.read_csv(evalfile, header=None, sep="\t")[1]
ev = ev.map(lambda x: [n for surf in json.loads(x) for n in normalize(surf)]).explode()

# One row per name that has both a word count and a summed link count.
df = pd.DataFrame(
    {"word_count": wc, "link_count": aec.groupby(level=0).sum()}, dtype="Int64"
).dropna()
38456 42330
[2]:
# log(word_count) / log(link_count): the larger the value, the more often the
# name occurs as a word relative to how often it is counted as a link anchor.
df["ratio"] = np.log(df["word_count"]) / np.log(df["link_count"])
[3]:
import itertools

import tqdm
from minimel.mentions import count_name_lines

# Only the first EVAL_LINES lines of the evaluation file are counted.
EVAL_LINES = 250

# Read the lines eagerly inside a context manager so the file is closed again;
# a list (like the previous dict view) has a length, so tqdm still shows a total.
with open(evalfile) as eval_fh:
    lines = list(itertools.islice(eval_fh, EVAL_LINES))

# `count_name_lines` yields pairs, which are collected into a dict here.
eval_count = dict(
    count_name_lines(tqdm.tqdm(lines, desc="Counting eval words"), countfile)
)
Counting eval words: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 250/250 [00:00<00:00, 1307.94it/s]
[4]:
"""
We want a trade-off between accepting bad entities and rejecting good ones.
"""
# Attach the evaluation statistics to every name, drop names without an
# evaluation count and order the rows by ascending ratio.
df2 = df.copy()
df2["eval_count"] = pd.Series(eval_count).astype("Int64")
df2["eval_linked"] = df2.index.isin(ev)
df2 = df2.dropna().sort_values("ratio")

# Green points are names that are linked in the evaluation data, red are not.
display(
    df2.plot.scatter(
        x="ratio",
        y="eval_count",
        c=df2["eval_linked"].map(lambda x: "g" if x else "r"),
        logy=True,
    )
)

# Each row is a candidate threshold: the row itself and every row before it
# (lower ratio) are accepted, every row after it is rejected.
linked_hits = df2["eval_count"] * df2["eval_linked"]
unlinked_hits = df2["eval_count"] * ~df2["eval_linked"]
true_positives = linked_hits.cumsum()
s = pd.DataFrame(
    {
        "false positives": unlinked_hits.cumsum(),
        # Linked occurrences strictly after the current row. A reversed cumsum
        # would include the current row again, although it is already counted
        # as a true positive, and would therefore understate recall.
        "false negatives": linked_hits.sum() - true_positives,
        "true positives": true_positives,
        "ratio": df2["ratio"],
    }
)
s["precision"] = s["true positives"] / (s["true positives"] + s["false positives"])
s["recall"] = s["true positives"] / (s["true positives"] + s["false negatives"])
s["f1"] = 2 * (s["precision"] * s["recall"]) / (s["precision"] + s["recall"])
s.set_index("ratio")[["precision", "recall", "f1"]].plot.line()
<Axes: xlabel='ratio', ylabel='eval_count'>
[4]:
<Axes: xlabel='ratio'>
[7]:
from IPython.display import HTML
def display_row(captioned_dataframes):
    """Render several captioned DataFrames next to each other.

    Parameters
    ----------
    captioned_dataframes : dict
        Maps a caption string to the DataFrame to show under that caption.

    Returns
    -------
    IPython.display.HTML
        A flex-row ``<div>`` holding one styled table per entry, in the
        mapping's iteration order.
    """
    # Styler.to_html() is the public rendering API. The private _repr_html_()
    # returns None when the "styler.render.repr" option is not "html", which
    # would make the join below fail.
    tables = " ".join(
        frame.style.set_caption(caption).to_html()
        for caption, frame in captioned_dataframes.items()
    )
    return HTML(
        f"""<div style="display:flex; flex-direction: row;">
{tables}
</div>"""
    )
# Any threshold whose F1 lies within F1_TOLERANCE of the best F1 is acceptable;
# take the first one in index order.
F1_TOLERANCE = 0.05
f1 = s.set_index("ratio")["f1"].astype("float").fillna(0)
cutoff = f1[abs(f1 - f1.max()) < F1_TOLERANCE].index[0]
print("Threshold:", cutoff)

# Names are dropped only when their ratio is strictly above the cutoff, so a
# name lying exactly on the cutoff is kept and must be counted as such.
print(
    "keeping",
    len(df[df["ratio"] <= cutoff]),
    "; dropping",
    len(df[df["ratio"] > cutoff]),
)

# Evaluation names that the cutoff would remove.
drop_ev = set(ev) & set(df[df["ratio"] > cutoff].index)
print("dropping eval names:", len(drop_ev))
print(drop_ev)

# Top 20 of each error type by evaluation count, shown side by side.
display_row(
    {
        # Kept (ratio <= cutoff) although never linked in the evaluation data.
        "False Positives": df2[~df2["eval_linked"] & (df2["ratio"] <= cutoff)]
        .sort_values("eval_count")[::-1]
        .head(20),
        # Dropped (ratio > cutoff) although linked in the evaluation data.
        "False Negatives": df2[df2["eval_linked"] & (df2["ratio"] > cutoff)]
        .sort_values("eval_count")[::-1]
        .head(20),
    }
)
Threshold: 1.951297634655119
keeping 36813 ; dropping 1642
dropping eval names: 261
{'2 december', 'patch', 'iraqi', 'target', 'maria', '26 april', 'first', 'judgment', 'dogs', 'tate', 'paralympics', 'reagan', 'kilometers', 'east coast', 'politically', 'reboot', 'unity', 'decatur', 'fusion', 'practice', 'association', 'teachers', 'english-language', 'south american', 'shadow', 'berlin, germany', 'de', 'wheelchair basketball', 'it', 'laurel', 'landmarks', 'the age', 'friendly', 'james', 'reduced', '25 june', 'out', 'civilians', 'jack', 'crustaceans', 'parents', 'prime', 'nelson', 'freestyle', 'in', 'awake', 'elementary', 'ace', 'buses', 'clinton', 'ron', 'red river', 'clouds', 'beta', 'communications', 'inches', 'kilometres', 'scarecrow', 'registered', 'guardian', 'values', 'dudley', 'pigs', 'licensing', 'shock', 's', 'cheyenne', 'offensive', 'bo', 'respect', 'minor', 'apples', 'the moon', 'golfer', 'irving', 'daniel', 'john', 'kenyan', 'uprising', 'lynn', 'heinz', 'left', 'scientists', 'metres', '25 september', 'run', 'flash', 'draft', 'meeting', 'bolivian', 'the game', 'saint john', 'agent', 'swift', 'astronomers', 'patrick', 'present', 'resolution', 'syrian', 'hope', 'focus', 'trophy', 'tornadoes', 'neutrality', 'png', 'europeans', 'concord', 'ministers', 'father', 'sharks', 'sri lankan', 'drivers', 'best actress', 'public service', 'northwest', 'north korean', 'wells', 'kilograms', 'this', 'commission', 'lost', 'core', 'lp', 'the left', 'acres', 'jackson', 'angels', 'apes', 'corona', 'saints', 'augusta', 'masters', 'convention', 'medium', 'w', 'unions', 'value', 'evening', 'eight', 'duty', 'refugees', 'mark', 'who', 'notre dame', 'believe', 'wayne', 'british house of commons', 'converted', 'lifetime', 'staff', 'thomas', 'leadership', 'manor', 'triple crown', 'may', 'nico', 'farmers', 'clyde', 'taxes', 'fairy tales', 'ted', 'bishops', 'freud', '30 november', 'create', 'lighting', 'svg', 'novels', '21 march', 'tortured', 'today', 'colombian', 'x', 'moe', 'led', 'malaysian', 'partnership', 'jefferson', 'tie', 'trained', 'seven', 'capitals', 
'southern', 'rabbits', 'short', 'museums', 'arrested', 'physicists', 'ministry', 'editorial', 'runner', 'mg', 'presidential election', 'bullets', 'criticized', 'retailer', 'sale', 'protests', 'psychologists', 'general assembly', 'b.c.', 'trailer', 'presidency', 'straight', 'edison', 'crash', 'divisions', 'the edge', 'roosevelt', 'stanford', 'al', 'tna', 'oceans', 'dolphins', 'hearts', 'show', 'algerian', 'path', 'constitutional', 'vista', 'council', 'loop', 'manuscripts', 'miles', 'opposition', 'riding', 'civil', 'runs', 'anc', 'wife', 'blacks', 'spiders', 'helicopters', 'bangladeshi', 'elections', 'evolutionary', 'run out', 'cornell', 'the white house', 'paul', 'engagement', 'false', 'style', 'elizabeth', 'complex', 'id', 'outbreak', 'speak', 'destiny', 'again', 'peter', 'north american', 'firefighters', 'frederick', 'lying', 'if', 'study', 'moderate', 'holly', 'websites', 'dollars'}
[7]:
| word_count | link_count | ratio | eval_count | eval_linked | |
|---|---|---|---|---|---|
| year | 22166 | 454 | 1.635527 | 140 | False |
| can | 50542 | 337 | 1.860894 | 111 | False |
| state | 19243 | 4312 | 1.178721 | 80 | False |
| day | 14248 | 325 | 1.653641 | 64 | False |
| party | 7020 | 142 | 1.787092 | 64 | False |
| u.s. | 4676 | 2064 | 1.107148 | 63 | False |
| world | 19029 | 524 | 1.573702 | 63 | False |
| group | 17128 | 269 | 1.742444 | 59 | False |
| second | 15927 | 190 | 1.844049 | 51 | False |
| country | 12390 | 4411 | 1.123070 | 49 | False |
| million | 9756 | 615 | 1.430423 | 46 | False |
| fire | 4523 | 570 | 1.326412 | 44 | False |
| public | 5622 | 220 | 1.600860 | 43 | False |
| interview | 957 | 116 | 1.443920 | 43 | False |
| man | 9036 | 169 | 1.775663 | 42 | False |
| uk | 5383 | 947 | 1.253557 | 42 | False |
| week | 2683 | 75 | 1.828538 | 39 | False |
| question | 1430 | 62 | 1.760405 | 36 | False |
| set | 10241 | 176 | 1.785936 | 35 | False |
| information | 5795 | 329 | 1.494939 | 34 | False |
| word_count | link_count | ratio | eval_count | eval_linked | |
|---|---|---|---|---|---|
| in | 882984 | 193 | 2.601533 | 1855 | True |
| s | 144874 | 364 | 2.015145 | 692 | True |
| it | 237945 | 43 | 3.291448 | 495 | True |
| this | 95628 | 158 | 2.265285 | 383 | True |
| who | 44973 | 38 | 2.945310 | 223 | True |
| first | 60542 | 13 | 4.292908 | 140 | True |
| if | 27312 | 25 | 3.173493 | 119 | True |
| out | 19446 | 63 | 2.383557 | 116 | True |
| today | 6340 | 36 | 2.443027 | 74 | True |
| may | 20704 | 132 | 2.035324 | 51 | True |
| al | 4833 | 26 | 2.603736 | 44 | True |
| left | 11344 | 75 | 2.162471 | 31 | True |
| again | 7952 | 17 | 3.169962 | 29 | True |
| show | 11522 | 70 | 2.201253 | 26 | True |
| run | 5137 | 13 | 3.331147 | 26 | True |
| believe | 3406 | 14 | 3.081894 | 24 | True |
| elections | 1471 | 31 | 2.123973 | 23 | True |
| arrested | 1569 | 13 | 2.868748 | 23 | True |
| meeting | 1389 | 27 | 2.195600 | 22 | True |
| john | 12391 | 63 | 2.274781 | 20 | True |
[7]:
from minimel.normalize import normalize
from minimel.vectorize import vw_tok
import json, math, tqdm, dawg
import pathlib

# Reload the raw {surface form: {entity: count}} mapping; the context manager
# closes the file handle again.
with open(countfile) as count_fh:
    surface_weights = json.load(count_fh)

# filter out selection: every name whose ratio is above the cutoff is dropped,
# compared on its tokenised form.
drop_tok = set(tuple(vw_tok(d)) for d in df[df.ratio > cutoff].index)
surface_weights = {
    k: v
    for k, v in surface_weights.items()
    # endswith() also copes with an empty key, where k[-1] raised IndexError.
    if (tuple(vw_tok(k)) not in drop_tok) and not k.endswith("-")
}
surface_trie = dawg.CompletionDAWG(surface_weights)

# Store the trie next to the count file: <stem>.salient.completiondawg
count_path = pathlib.Path(countfile)
surface_trie.save(count_path.parent / (count_path.stem + ".salient.completiondawg"))
Analyse Mention Detection
[15]:
l = "stille oceaan"
# `df` only contains names that have both a word count and a link count, so an
# arbitrary name can be missing; report that instead of raising KeyError.
if l in df.index:
    display(df.loc[l])
else:
    print(f"{l!r} is not in df")
# Show the per-entity link counts and whether the name is among the dropped
# evaluation names.
if l in aec:
    display(aec[l], l in drop_ev)
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
File ~/.conda/envs/p38/lib/python3.8/site-packages/pandas/core/indexes/base.py:3652, in Index.get_loc(self, key)
3651 try:
-> 3652 return self._engine.get_loc(casted_key)
3653 except KeyError as err:
File ~/.conda/envs/p38/lib/python3.8/site-packages/pandas/_libs/index.pyx:147, in pandas._libs.index.IndexEngine.get_loc()
File ~/.conda/envs/p38/lib/python3.8/site-packages/pandas/_libs/index.pyx:176, in pandas._libs.index.IndexEngine.get_loc()
File pandas/_libs/hashtable_class_helper.pxi:7080, in pandas._libs.hashtable.PyObjectHashTable.get_item()
File pandas/_libs/hashtable_class_helper.pxi:7088, in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'stille oceaan'
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
Cell In[15], line 2
1 l = "stille oceaan"
----> 2 display(df.loc[l])
3 if l in aec:
4 display(aec[l], l in drop_ev)
File ~/.conda/envs/p38/lib/python3.8/site-packages/pandas/core/indexing.py:1103, in _LocationIndexer.__getitem__(self, key)
1100 axis = self.axis or 0
1102 maybe_callable = com.apply_if_callable(key, self.obj)
-> 1103 return self._getitem_axis(maybe_callable, axis=axis)
File ~/.conda/envs/p38/lib/python3.8/site-packages/pandas/core/indexing.py:1343, in _LocIndexer._getitem_axis(self, key, axis)
1341 # fall thru to straight lookup
1342 self._validate_key(key, axis)
-> 1343 return self._get_label(key, axis=axis)
File ~/.conda/envs/p38/lib/python3.8/site-packages/pandas/core/indexing.py:1293, in _LocIndexer._get_label(self, label, axis)
1291 def _get_label(self, label, axis: AxisInt):
1292 # GH#5567 this will fail if the label is not present in the axis.
-> 1293 return self.obj.xs(label, axis=axis)
File ~/.conda/envs/p38/lib/python3.8/site-packages/pandas/core/generic.py:4095, in NDFrame.xs(self, key, axis, level, drop_level)
4093 new_index = index[loc]
4094 else:
-> 4095 loc = index.get_loc(key)
4097 if isinstance(loc, np.ndarray):
4098 if loc.dtype == np.bool_:
File ~/.conda/envs/p38/lib/python3.8/site-packages/pandas/core/indexes/base.py:3654, in Index.get_loc(self, key)
3652 return self._engine.get_loc(casted_key)
3653 except KeyError as err:
-> 3654 raise KeyError(key) from err
3655 except TypeError:
3656 # If we have a listlike key, _check_indexing_error will raise
3657 # InvalidIndexError. Otherwise we fall through and re-raise
3658 # the TypeError.
3659 self._check_indexing_error(key)
KeyError: 'stille oceaan'
[10]:
# Look up the per-entity counts stored under this surface form.
surface_weights["stille oceaan"]
[10]:
{'Q184425': 2, 'Q98': 1274}
[77]:
def get_matches(surface_trie, text, language=None):
    """Yield the trie keys whose token sequence occurs in ``text``.

    ``text`` is passed through ``normalize`` (with ``language`` forwarded) and
    each normalised variant is tokenised with ``vw_tok``. At every token
    position the trie is asked for all keys starting with that token; a key is
    yielded when its own tokens equal the tokens found at that position.
    A key is yielded once per position and variant where it matches, so the
    same key can appear several times.
    """
    for variant in normalize(text, language=language):
        tokens = vw_tok(variant)
        for start, first_token in enumerate(tokens):
            for candidate in surface_trie.keys(first_token):
                candidate_tokens = vw_tok(candidate)
                window = tokens[start : start + len(candidate_tokens)]
                # A prefix hit alone is not enough: all tokens must line up.
                if window != candidate_tokens:
                    continue
                yield candidate
import pandas as pd
# Read the first ten rows of the evaluation file and decode the JSON in column 1.
data = pd.read_csv("evaluation/Mewsli-9/nl.tsv", header=None, sep="\t", nrows=10)
data[1] = data[1].map(json.loads)

# Take the row at position 6; its three columns unpack to
# (ignored, gold mentions, text).
_, mention_ent, text = data.iloc[6]
print(text)
print(mention_ent)

# Print every surface form found in the text with its candidate entity counts.
for match in get_matches(surface_trie, text):
    # NIL match
    weights = surface_weights[match]
    print(match, weights)
    # for label in vw_label_lines(weights, -1, comp, ent_feats):
    # labels.append(label)
Zware aardbeving voor de kust in zuiden Mexico 8 september 2017 Zware aardbeving 8.2 voor de kust in zuiden Mexico De zuidkust van Mexico is omstreeks middernacht plaatselijke tijd getroffen door een aardbeving. De Mexicaanse seismologische dienst zegt dat de beving een kracht van 8.2 op de schaal van Richter had. Het centrum was in de Stille Oceaan, op 70 kilometer diepte, zo'n 100 kilometer ten zuidwesten van Pijijiapan. De meeste doden vielen in Oaxaca en het aangrenzende Chiapas. Ook in buurland Guatemala heeft de beving een dodelijk slachtoffer geëist. De aardbeving was ook te voelen in Mexico-Stad, waar mensen in paniek de straat op renden. De aardbeving zorgde even voor een grootschalige stroomuitval bij 1 miljoen mensen, die snel weer kon worden verholpen. De beving veroorzaakte ook een lichte tsunami. Een dergelijke krachtige aardbeving komt gemiddeld één keer per jaar voor. De vorige van dit kaliber was in 2015, in Chili. De laatste zware aardbeving in Mexico (waarbij vele duizenden doden vielen) was in 1985. Deze nieuwe beving is echter volgens president Nieto de krachtigste in Mexico in 100 jaar. Mexicaans president Enrique Peña Nieto heeft daags nadien drie dagen van nationale rouw afgekondigd. Het officiële dodental staat nu op 65. Vooral in Juchitan is veel schade aan woningen. Bronnen. Zelf schrijven? Hoe schrijf ik een artikel?
{'Oaxaca': 34110, 'Chiapas': 60123, 'Guatemala': 774, 'Mexico-Stad': 1489, 'in 1985': 1798567}
zware aardbeving {'Q214866': 2, 'Q43777': 15, 'Q151835': 2, 'Q211386': 3, 'Q1798567': 2}
aardbeving {'Q214866': 3, 'Q1903316': 3, 'Q749610': 3, 'Q1053476': 2, 'Q116688': 8, 'Q16068825': 2, 'Q191055': 5, 'Q328663': 7, 'Q210137': 2, 'Q2441400': 7, 'Q43777': 9, 'Q1909872': 2, 'Q19830062': 5, 'Q2885880': 2, 'Q3245204': 3, 'Q274498': 2, 'Q27850765': 2, 'Q2483366': 2, 'Q3161000': 2, 'Q38821': 3, 'Q939106': 6, 'Q1082836': 3, 'Q130754': 2, 'Q1847006': 2, 'Q1348954': 3, 'Q26689701': 3, 'Q745411': 3, 'Q151835': 4, 'Q1990627': 2, 'Q1348910': 5, 'Q211386': 7, 'Q7944': 1417, 'Q867691': 5, 'Q152033': 4, 'Q47144124': 2, 'Q207918': 5, 'Q1798567': 3}
mexico {'Q588480': 2, 'Q260965': 6, 'Q15296784': 18, 'Q599923': 19, 'Q6797304': 4, 'Q285658': 4, 'Q170603': 3, 'Q55712': 11, 'Q641583': 3, 'Q50647': 3, 'Q96': 10551, 'Q2151952': 12, 'Q941143': 2, 'Q8109': 2, 'Q48311380': 2, 'Q108810124': 4, 'Q51636908': 3, 'Q1489': 105, 'Q478399': 6, 'Q82112': 263, 'Q371334': 16, 'Q2294975': 9, 'Q764690': 8, 'Q2776455': 6, 'Q29066592': 3, 'Q603181': 3, 'Q173099': 2, 'Q1182352': 11, 'Q1166178': 3, 'Q1926260': 10, 'Q17041307': 5, 'Q150644': 2, 'Q2342098': 7, 'Q2712776': 3, 'Q12123144': 3, 'Q22696198': 2, 'Q164089': 908, 'Q1160682': 22, 'Q1907023': 4, 'Q3098723': 4, 'Q279769': 2, 'Q1041675': 2, 'Q387588': 3, 'Q19826175': 13, 'Q178652': 12, 'Q2376913': 2, 'Q43196466': 9, 'Q1414354': 3, 'Q8429': 7, 'Q979079': 6, 'Q1151279': 3}
8 september {'Q2850': 1588}
8½ {'Q12018': 22}
zware aardbeving {'Q214866': 2, 'Q43777': 15, 'Q151835': 2, 'Q211386': 3, 'Q1798567': 2}
aardbeving {'Q214866': 3, 'Q1903316': 3, 'Q749610': 3, 'Q1053476': 2, 'Q116688': 8, 'Q16068825': 2, 'Q191055': 5, 'Q328663': 7, 'Q210137': 2, 'Q2441400': 7, 'Q43777': 9, 'Q1909872': 2, 'Q19830062': 5, 'Q2885880': 2, 'Q3245204': 3, 'Q274498': 2, 'Q27850765': 2, 'Q2483366': 2, 'Q3161000': 2, 'Q38821': 3, 'Q939106': 6, 'Q1082836': 3, 'Q130754': 2, 'Q1847006': 2, 'Q1348954': 3, 'Q26689701': 3, 'Q745411': 3, 'Q151835': 4, 'Q1990627': 2, 'Q1348910': 5, 'Q211386': 7, 'Q7944': 1417, 'Q867691': 5, 'Q152033': 4, 'Q47144124': 2, 'Q207918': 5, 'Q1798567': 3}
8½ {'Q12018': 22}
mexico {'Q588480': 2, 'Q260965': 6, 'Q15296784': 18, 'Q599923': 19, 'Q6797304': 4, 'Q285658': 4, 'Q170603': 3, 'Q55712': 11, 'Q641583': 3, 'Q50647': 3, 'Q96': 10551, 'Q2151952': 12, 'Q941143': 2, 'Q8109': 2, 'Q48311380': 2, 'Q108810124': 4, 'Q51636908': 3, 'Q1489': 105, 'Q478399': 6, 'Q82112': 263, 'Q371334': 16, 'Q2294975': 9, 'Q764690': 8, 'Q2776455': 6, 'Q29066592': 3, 'Q603181': 3, 'Q173099': 2, 'Q1182352': 11, 'Q1166178': 3, 'Q1926260': 10, 'Q17041307': 5, 'Q150644': 2, 'Q2342098': 7, 'Q2712776': 3, 'Q12123144': 3, 'Q22696198': 2, 'Q164089': 908, 'Q1160682': 22, 'Q1907023': 4, 'Q3098723': 4, 'Q279769': 2, 'Q1041675': 2, 'Q387588': 3, 'Q19826175': 13, 'Q178652': 12, 'Q2376913': 2, 'Q43196466': 9, 'Q1414354': 3, 'Q8429': 7, 'Q979079': 6, 'Q1151279': 3}
mexico {'Q588480': 2, 'Q260965': 6, 'Q15296784': 18, 'Q599923': 19, 'Q6797304': 4, 'Q285658': 4, 'Q170603': 3, 'Q55712': 11, 'Q641583': 3, 'Q50647': 3, 'Q96': 10551, 'Q2151952': 12, 'Q941143': 2, 'Q8109': 2, 'Q48311380': 2, 'Q108810124': 4, 'Q51636908': 3, 'Q1489': 105, 'Q478399': 6, 'Q82112': 263, 'Q371334': 16, 'Q2294975': 9, 'Q764690': 8, 'Q2776455': 6, 'Q29066592': 3, 'Q603181': 3, 'Q173099': 2, 'Q1182352': 11, 'Q1166178': 3, 'Q1926260': 10, 'Q17041307': 5, 'Q150644': 2, 'Q2342098': 7, 'Q2712776': 3, 'Q12123144': 3, 'Q22696198': 2, 'Q164089': 908, 'Q1160682': 22, 'Q1907023': 4, 'Q3098723': 4, 'Q279769': 2, 'Q1041675': 2, 'Q387588': 3, 'Q19826175': 13, 'Q178652': 12, 'Q2376913': 2, 'Q43196466': 9, 'Q1414354': 3, 'Q8429': 7, 'Q979079': 6, 'Q1151279': 3}
middernacht {'Q36402': 104}
plaatselijke tijd {'Q6940': 2}
aardbeving {'Q214866': 3, 'Q1903316': 3, 'Q749610': 3, 'Q1053476': 2, 'Q116688': 8, 'Q16068825': 2, 'Q191055': 5, 'Q328663': 7, 'Q210137': 2, 'Q2441400': 7, 'Q43777': 9, 'Q1909872': 2, 'Q19830062': 5, 'Q2885880': 2, 'Q3245204': 3, 'Q274498': 2, 'Q27850765': 2, 'Q2483366': 2, 'Q3161000': 2, 'Q38821': 3, 'Q939106': 6, 'Q1082836': 3, 'Q130754': 2, 'Q1847006': 2, 'Q1348954': 3, 'Q26689701': 3, 'Q745411': 3, 'Q151835': 4, 'Q1990627': 2, 'Q1348910': 5, 'Q211386': 7, 'Q7944': 1417, 'Q867691': 5, 'Q152033': 4, 'Q47144124': 2, 'Q207918': 5, 'Q1798567': 3}
mexicaanse {'Q96': 2011, 'Q581921': 2, 'Q764690': 2, 'Q207965': 4}
seismologische {'Q83371': 17}
beving {'Q1903316': 25}
8½ {'Q12018': 22}
de schaal van richter {'Q38768': 6}
schaal van richter {'Q38768': 719}
richter {'Q2777659': 54, 'Q81240': 4, 'Q2874233': 13, 'Q38768': 7, 'Q224831': 3}
stille oceaan {'Q184425': 2, 'Q98': 1274}
100 kilometer {'Q1847570': 5}
oaxaca {'Q13906835': 3, 'Q131429': 71, 'Q34110': 383, 'Q345582': 2}
chiapas {'Q12187096': 13, 'Q1426348': 16, 'Q60123': 394}
guatemala {'Q1536706': 5, 'Q844729': 3, 'Q695660': 33, 'Q3667955': 3, 'Q270168': 188, 'Q774': 2109}
beving {'Q1903316': 25}
slachtoffer {'Q1851760': 351, 'Q181600': 6}
aardbeving {'Q214866': 3, 'Q1903316': 3, 'Q749610': 3, 'Q1053476': 2, 'Q116688': 8, 'Q16068825': 2, 'Q191055': 5, 'Q328663': 7, 'Q210137': 2, 'Q2441400': 7, 'Q43777': 9, 'Q1909872': 2, 'Q19830062': 5, 'Q2885880': 2, 'Q3245204': 3, 'Q274498': 2, 'Q27850765': 2, 'Q2483366': 2, 'Q3161000': 2, 'Q38821': 3, 'Q939106': 6, 'Q1082836': 3, 'Q130754': 2, 'Q1847006': 2, 'Q1348954': 3, 'Q26689701': 3, 'Q745411': 3, 'Q151835': 4, 'Q1990627': 2, 'Q1348910': 5, 'Q211386': 7, 'Q7944': 1417, 'Q867691': 5, 'Q152033': 4, 'Q47144124': 2, 'Q207918': 5, 'Q1798567': 3}
voelen {'Q205555': 14, 'Q328835': 16}
mexico {'Q588480': 2, 'Q260965': 6, 'Q15296784': 18, 'Q599923': 19, 'Q6797304': 4, 'Q285658': 4, 'Q170603': 3, 'Q55712': 11, 'Q641583': 3, 'Q50647': 3, 'Q96': 10551, 'Q2151952': 12, 'Q941143': 2, 'Q8109': 2, 'Q48311380': 2, 'Q108810124': 4, 'Q51636908': 3, 'Q1489': 105, 'Q478399': 6, 'Q82112': 263, 'Q371334': 16, 'Q2294975': 9, 'Q764690': 8, 'Q2776455': 6, 'Q29066592': 3, 'Q603181': 3, 'Q173099': 2, 'Q1182352': 11, 'Q1166178': 3, 'Q1926260': 10, 'Q17041307': 5, 'Q150644': 2, 'Q2342098': 7, 'Q2712776': 3, 'Q12123144': 3, 'Q22696198': 2, 'Q164089': 908, 'Q1160682': 22, 'Q1907023': 4, 'Q3098723': 4, 'Q279769': 2, 'Q1041675': 2, 'Q387588': 3, 'Q19826175': 13, 'Q178652': 12, 'Q2376913': 2, 'Q43196466': 9, 'Q1414354': 3, 'Q8429': 7, 'Q979079': 6, 'Q1151279': 3}
mexico stad {'Q1489': 23}
mexico-stad {'Q1489': 2470, 'Q665894': 3, 'Q927513': 2}
paniek {'Q696490': 3, 'Q208450': 45, 'Q2788113': 5}
aardbeving {'Q214866': 3, 'Q1903316': 3, 'Q749610': 3, 'Q1053476': 2, 'Q116688': 8, 'Q16068825': 2, 'Q191055': 5, 'Q328663': 7, 'Q210137': 2, 'Q2441400': 7, 'Q43777': 9, 'Q1909872': 2, 'Q19830062': 5, 'Q2885880': 2, 'Q3245204': 3, 'Q274498': 2, 'Q27850765': 2, 'Q2483366': 2, 'Q3161000': 2, 'Q38821': 3, 'Q939106': 6, 'Q1082836': 3, 'Q130754': 2, 'Q1847006': 2, 'Q1348954': 3, 'Q26689701': 3, 'Q745411': 3, 'Q151835': 4, 'Q1990627': 2, 'Q1348910': 5, 'Q211386': 7, 'Q7944': 1417, 'Q867691': 5, 'Q152033': 4, 'Q47144124': 2, 'Q207918': 5, 'Q1798567': 3}
stroomuitval {'Q828827': 45}
beving {'Q1903316': 25}
tsunami {'Q8070': 465, 'Q36204': 6, 'Q2169285': 6, 'Q1211774': 4, 'Q130754': 22, 'Q15042655': 21}
aardbeving {'Q214866': 3, 'Q1903316': 3, 'Q749610': 3, 'Q1053476': 2, 'Q116688': 8, 'Q16068825': 2, 'Q191055': 5, 'Q328663': 7, 'Q210137': 2, 'Q2441400': 7, 'Q43777': 9, 'Q1909872': 2, 'Q19830062': 5, 'Q2885880': 2, 'Q3245204': 3, 'Q274498': 2, 'Q27850765': 2, 'Q2483366': 2, 'Q3161000': 2, 'Q38821': 3, 'Q939106': 6, 'Q1082836': 3, 'Q130754': 2, 'Q1847006': 2, 'Q1348954': 3, 'Q26689701': 3, 'Q745411': 3, 'Q151835': 4, 'Q1990627': 2, 'Q1348910': 5, 'Q211386': 7, 'Q7944': 1417, 'Q867691': 5, 'Q152033': 4, 'Q47144124': 2, 'Q207918': 5, 'Q1798567': 3}
kaliber {'Q170417': 393, 'Q3812588': 2, 'Q256690': 14}
chili {'Q147131': 2, 'Q863794': 2, 'Q683744': 2, 'Q165199': 3, 'Q1896262': 11, 'Q589856': 7, 'Q143856': 2, 'Q2342527': 2, 'Q172025': 757, 'Q2105105': 10, 'Q393170': 2, 'Q602656': 2, 'Q261405': 7, 'Q298': 4000, 'Q1072889': 2, 'Q755107': 2, 'Q15293622': 12, 'Q606832': 9}
zware aardbeving {'Q214866': 2, 'Q43777': 15, 'Q151835': 2, 'Q211386': 3, 'Q1798567': 2}
aardbeving {'Q214866': 3, 'Q1903316': 3, 'Q749610': 3, 'Q1053476': 2, 'Q116688': 8, 'Q16068825': 2, 'Q191055': 5, 'Q328663': 7, 'Q210137': 2, 'Q2441400': 7, 'Q43777': 9, 'Q1909872': 2, 'Q19830062': 5, 'Q2885880': 2, 'Q3245204': 3, 'Q274498': 2, 'Q27850765': 2, 'Q2483366': 2, 'Q3161000': 2, 'Q38821': 3, 'Q939106': 6, 'Q1082836': 3, 'Q130754': 2, 'Q1847006': 2, 'Q1348954': 3, 'Q26689701': 3, 'Q745411': 3, 'Q151835': 4, 'Q1990627': 2, 'Q1348910': 5, 'Q211386': 7, 'Q7944': 1417, 'Q867691': 5, 'Q152033': 4, 'Q47144124': 2, 'Q207918': 5, 'Q1798567': 3}
mexico {'Q588480': 2, 'Q260965': 6, 'Q15296784': 18, 'Q599923': 19, 'Q6797304': 4, 'Q285658': 4, 'Q170603': 3, 'Q55712': 11, 'Q641583': 3, 'Q50647': 3, 'Q96': 10551, 'Q2151952': 12, 'Q941143': 2, 'Q8109': 2, 'Q48311380': 2, 'Q108810124': 4, 'Q51636908': 3, 'Q1489': 105, 'Q478399': 6, 'Q82112': 263, 'Q371334': 16, 'Q2294975': 9, 'Q764690': 8, 'Q2776455': 6, 'Q29066592': 3, 'Q603181': 3, 'Q173099': 2, 'Q1182352': 11, 'Q1166178': 3, 'Q1926260': 10, 'Q17041307': 5, 'Q150644': 2, 'Q2342098': 7, 'Q2712776': 3, 'Q12123144': 3, 'Q22696198': 2, 'Q164089': 908, 'Q1160682': 22, 'Q1907023': 4, 'Q3098723': 4, 'Q279769': 2, 'Q1041675': 2, 'Q387588': 3, 'Q19826175': 13, 'Q178652': 12, 'Q2376913': 2, 'Q43196466': 9, 'Q1414354': 3, 'Q8429': 7, 'Q979079': 6, 'Q1151279': 3}
beving {'Q1903316': 25}
nieto {'Q251846': 86}
mexico {'Q588480': 2, 'Q260965': 6, 'Q15296784': 18, 'Q599923': 19, 'Q6797304': 4, 'Q285658': 4, 'Q170603': 3, 'Q55712': 11, 'Q641583': 3, 'Q50647': 3, 'Q96': 10551, 'Q2151952': 12, 'Q941143': 2, 'Q8109': 2, 'Q48311380': 2, 'Q108810124': 4, 'Q51636908': 3, 'Q1489': 105, 'Q478399': 6, 'Q82112': 263, 'Q371334': 16, 'Q2294975': 9, 'Q764690': 8, 'Q2776455': 6, 'Q29066592': 3, 'Q603181': 3, 'Q173099': 2, 'Q1182352': 11, 'Q1166178': 3, 'Q1926260': 10, 'Q17041307': 5, 'Q150644': 2, 'Q2342098': 7, 'Q2712776': 3, 'Q12123144': 3, 'Q22696198': 2, 'Q164089': 908, 'Q1160682': 22, 'Q1907023': 4, 'Q3098723': 4, 'Q279769': 2, 'Q1041675': 2, 'Q387588': 3, 'Q19826175': 13, 'Q178652': 12, 'Q2376913': 2, 'Q43196466': 9, 'Q1414354': 3, 'Q8429': 7, 'Q979079': 6, 'Q1151279': 3}
mexicaans {'Q82112': 2, 'Q96': 2266}
enrique {'Q552058': 12, 'Q47122': 2, 'Q1803019': 2, 'Q2290535': 2}
enrique peña {'Q296741': 5}
enrique peña nieto {'Q296741': 36}
peña {'Q3313041': 8, 'Q417545': 4, 'Q2497712': 6, 'Q6125576': 13, 'Q532103': 12}
nieto {'Q251846': 86}
dagen van nationale rouw {'Q1127742': 16}
nationale rouw {'Q13878715': 2, 'Q1127742': 18}
rouw {'Q2450206': 2, 'Q750652': 157}
afgekondigd {'Q446780': 2}
dodental {'Q1435473': 3}
[ ]:
[75]:
# Inspect the statistics row of a single name.
# NOTE(review): the saved output lists wc / aec / ev_count / ev_linked, which
# differ from the column names assigned earlier in this notebook (word_count,
# link_count, eval_count, eval_linked) - presumably stale; re-run to confirm.
df2.loc["op de rails"]
[75]:
wc 1.0
aec 59.0
ratio 0.0
ev_count 2.0
ev_linked 0.0
Name: op de rails, dtype: Float64
[5]:
from minimel.vectorize import vw
import pathlib
# Run `vw` on a single hand-written input line together with the given file.
fname = "wiki/nlwiki-20220301/experiments/clean-q0.25.json"
example_lines = ['id\t{"amsterdam": 9899}\tik ging naar amsterdam en Rotterdam']
vw(example_lines, pathlib.Path(fname))
[5]:
['shared |s ik ging naar amsterdam en rotterdam',
'9899:0 |l amsterdam=9899 ',
'727:1 |l amsterdam=727 ',
'478771:1 |l amsterdam=478771 ',
'50719:1 |l amsterdam=50719 ',
'2060132:1 |l amsterdam=2060132 ',
'214341:1 |l amsterdam=214341 ',
'194215:1 |l amsterdam=194215 ',
'26674641:1 |l amsterdam=26674641 ',
'1127380:1 |l amsterdam=1127380 ',
'9694:1 |l amsterdam=9694 ',
'2691069:1 |l amsterdam=2691069 ',
'2393888:1 |l amsterdam=2393888 ',
'478785:1 |l amsterdam=478785 ',
'1397383:1 |l amsterdam=1397383 ',
'122781:1 |l amsterdam=122781 ',
'959016:1 |l amsterdam=959016 ',
'5715626:1 |l amsterdam=5715626 ',
'2060246:1 |l amsterdam=2060246 ',
'4748823:1 |l amsterdam=4748823 ',
'2049529:1 |l amsterdam=2049529 ',
'95630476:1 |l amsterdam=95630476 ',
'683829:1 |l amsterdam=683829 ',
'478456:1 |l amsterdam=478456 ',
'4549041:1 |l amsterdam=4549041 ',
'13427674:1 |l amsterdam=13427674 ',
'585429:1 |l amsterdam=585429 ',
'505639:1 |l amsterdam=505639 ',
'2276292:1 |l amsterdam=2276292 ',
'-1:0 |l ik=-1 ',
'49404:1 |l ik=49404 ',
'2422744:1 |l ik=2422744 ',
'1143089:1 |l ik=1143089 ',
'-1:0 |l en=-1 ',
'1860:1 |l en=1860 ',
'1377447:1 |l en=1377447 ',
'191081:1 |l en=191081 ',
'933:1 |l en=933 ',
'-1:0 |l rotterdam=-1 ',
'34370:1 |l rotterdam=34370 ',
'2680952:1 |l rotterdam=2680952 ',
'201284:1 |l rotterdam=201284 ',
'801388:1 |l rotterdam=801388 ',
'1027807:1 |l rotterdam=1027807 ',
'166127:1 |l rotterdam=166127 ',
'1429091:1 |l rotterdam=1429091 ',
'783945:1 |l rotterdam=783945 ',
'2329167:1 |l rotterdam=2329167 ',
'931364:1 |l rotterdam=931364 ',
'1341108:1 |l rotterdam=1341108 ',
'299701:1 |l rotterdam=299701 ',
'2166239:1 |l rotterdam=2166239 ',
'2856391:1 |l rotterdam=2856391 ',
'656807:1 |l rotterdam=656807 ',
'633529:1 |l rotterdam=633529 ',
'849230:1 |l rotterdam=849230 ',
'1774596:1 |l rotterdam=1774596 ',
'2930113:1 |l rotterdam=2930113 ',
'1866458:1 |l rotterdam=1866458 ',
'',
'']