Unobserved mentions
[4]:
import pathlib
# Wikipedia dump (wiki name, dump version) to use for each language code.
# The language code also selects the gold file under evaluation/Mewsli-9/.
WIKI_DUMPS = {
    'en': ('simplewiki', '20211120'),
    'ta': ('tawiki', '20220301'),
    'fa': ('fawiki', '20220301'),
    'tr': ('trwiki', '20220301'),
    'nl': ('nlwiki', '20220301'),
    'ar': ('arwiki', '20220301'),
    'sr': ('srwiki', '20220301'),
    'es': ('eswiki', '20220301'),
    'ja': ('jawiki', '20220301'),
    'de': ('dewiki', '20220301'),
}

# Change this single value to evaluate another language.
langcode = 'nl'
wiki, version = WIKI_DUMPS[langcode]
fgold = pathlib.Path(f"evaluation/Mewsli-9/{langcode}.tsv")
[14]:
import pandas as pd
import json, glob, dawg
from minimel.normalize import normalize
# Load the per-surface-form entity counts. Inner keys have every 'Q' removed
# (presumably Wikidata QIDs such as "Q42" -> "42"; verify against the file)
# so they can be compared with the "%s"-formatted gold references in rank().
# The file is opened in a `with` block so the handle is closed deterministically,
# and read as UTF-8 (the JSON default) regardless of the platform locale.
with open(f"wiki/{wiki}-{version}/count.min2.json", encoding="utf-8") as counts_file:
    counts = {
        form: {ent.replace('Q', ''): n for ent, n in ent_counts.items()}
        for form, ent_counts in json.load(counts_file).items()
    }

# First column of the disambiguation list, as a set for fast membership tests.
disambig = set(pd.read_csv("data/wikidata-20211122-disambig.txt", header=None)[0])

# glob.glob returns matches in arbitrary order; sort so the chosen index file
# is reproducible, and fail with a readable message when nothing matches.
index_candidates = sorted(glob.glob(f"wiki/{wiki}-*/index_{wiki}-*.dawg"))
if not index_candidates:
    raise FileNotFoundError(f"no index_{wiki}-*.dawg found under wiki/{wiki}-*/")
index = index_candidates[0]
wm = dawg.IntDAWG().load(index)
def rank(wm, counts, surface, i, stem=None):
    """Return ``(count, rank)`` of entity ``i`` for the mention ``surface``.

    Every form produced by ``normalize(surface, language=stem)`` is
    lower-cased and looked up in ``counts``. The first form under which
    ``i`` has a non-zero count decides the result:

    * ``(count, -1)`` if ``i`` is the only candidate for that form;
    * ``(count, position)`` otherwise, where ``position`` is the 0-based
      index of ``i`` among the candidates sorted by descending count.

    If no form lists ``i``, the page-title index ``wm`` is queried with a
    title-style key (first character upper-cased, spaces in the remainder
    replaced by underscores); a match yields ``(1, -1)``.
    In all other cases ``(0, None)`` is returned.

    Args:
        wm: object with a ``get(key)`` method mapping page titles to entity ids.
        counts: mapping ``surface form -> {entity id as string: count}``.
        surface: mention text; an empty string is treated as "not found".
        i: gold entity id; formatted with ``"%s"`` for the ``counts`` lookup and
            compared as-is with ``wm.get(...)``.
            NOTE(review): that comparison is type-sensitive - confirm ``i`` has
            the same type as the values stored in ``wm``.
        stem: forwarded to ``normalize`` as its ``language`` argument.
    """
    ent_key = "%s" % i
    for norm in normalize(surface, language=stem):
        candidates = counts.get(norm.lower(), {})
        ent_count = candidates.get(ent_key)
        if not ent_count:
            # Entity unknown (or zero count) for this form: try the next one.
            continue
        if len(candidates) == 1:
            return ent_count, -1
        by_count_desc = sorted(candidates, key=lambda ent: -candidates[ent])
        return ent_count, by_count_desc.index(ent_key)

    # Fallback: exact page-title match. Guard against an empty surface,
    # which would otherwise raise IndexError on surface[0].
    if surface:
        title = surface[0].upper() + surface[1:].replace(" ", "_")
        if wm.get(title) == i:
            return 1, -1
    return 0, None
# One row per gold mention: surface text, gold reference, and the
# (count, rank) pair computed by rank(). Each gold line must have exactly
# three tab-separated fields; only the second (a JSON object mapping
# surface -> reference) is used. References listed in `disambig` are skipped.
# The gold file is read as UTF-8 inside a `with` block so the handle is
# closed and decoding does not depend on the platform locale.
with fgold.open(encoding="utf-8") as gold_file:
    ranks = pd.DataFrame(
        [
            (surface, ref, *rank(wm, counts, surface, ref))
            for _, links, _text in (line.split("\t") for line in gold_file)
            for surface, ref in json.loads(links).items()
            if ref not in disambig
        ],
        columns=["surface", "ref", "count", "rank"],
    )
ranks
[14]:
| surface | ref | count | rank | |
|---|---|---|---|---|
| 0 | Tangerang | 10127 | 124 | -1.0 |
| 1 | kortsluiting | 206907 | 115 | 0.0 |
| 2 | Bunq | 24084541 | 13 | -1.0 |
| 3 | Mosoel | 83317 | 250 | -1.0 |
| 4 | Abadi | 5639013 | 0 | NaN |
| ... | ... | ... | ... | ... |
| 11341 | VN-Veiligheidsraad | 37470 | 1721 | -1.0 |
| 11342 | Verenigde Naties | 1065 | 4826 | 0.0 |
| 11343 | Paramacaanse | 2105235 | 0 | NaN |
| 11344 | Langatabbetje | 2782195 | 14 | -1.0 |
| 11345 | Ceuta | 5823 | 257 | 0.0 |
11346 rows × 4 columns
[16]:
# Count how often each distinct (surface, ref, count, rank) row occurs in the
# test set. Missing ranks are replaced by the string "na" first so those rows
# are kept by value_counts().
test_counts = (
    ranks
    .fillna("na")
    .value_counts()
    .rename("test_count")
    .reset_index()
)
# Mentions with no count and no rank, most frequent first.
test_counts.loc[
    (test_counts["count"] == 0) & (test_counts["rank"] == "na")
].head(20)
[16]:
| surface | ref | count | rank | test_count | |
|---|---|---|---|---|---|
| 327 | Nations League | 29000951 | 0 | na | 4 |
| 365 | World Solar Challenge | 14520244 | 0 | na | 4 |
| 429 | Abadi | 5639013 | 0 | na | 3 |
| 457 | Le Pen | 12927 | 0 | na | 3 |
| 635 | Trudeau | 3099714 | 0 | na | 3 |
| 703 | Wereldkampioenschap voetbal in Mexico | 46938 | 0 | na | 2 |
| 755 | VK.com | 116933 | 0 | na | 2 |
| 772 | haat in Brussel | 23365300 | 0 | na | 2 |
| 886 | Mattarella | 3956186 | 0 | na | 2 |
| 1010 | brug | 2222042 | 0 | na | 2 |
| 1034 | Litvinenko | 87015 | 0 | na | 2 |
| 1058 | Team BMC | 787401 | 0 | na | 2 |
| 1076 | Tweede Kamerverkiezingen van 15 maart | 16061881 | 0 | na | 2 |
| 1082 | getaserd | 870114 | 0 | na | 2 |
| 1149 | TUI fly | 664509 | 0 | na | 2 |
| 1151 | MOAS | 18209263 | 0 | na | 2 |
| 1194 | tsunami | 60042839 | 0 | na | 2 |
| 1208 | vale gieren | 177856 | 0 | na | 2 |
| 1217 | Amerikaanse presidentschap | 45578 | 0 | na | 2 |
| 1281 | NPB | 2113973 | 0 | na | 2 |
Hardest entities
[17]:
# Surfaces for which some gold entity is ranked at position 1.
hard_ents = test_counts.loc[test_counts["rank"] == 1, "surface"]
# Show every row (all refs and ranks) recorded for those surfaces.
counts_by_surface = test_counts.set_index("surface")
counts_by_surface.loc[hard_ents.tolist()]
[17]:
| ref | count | rank | test_count | |
|---|---|---|---|---|
| surface | ||||
| Openbaar Ministerie | 2595790 | 553 | 0.0 | 19 |
| Openbaar Ministerie | 11775750 | 312 | 1.0 | 6 |
| China | 148 | 6406 | 1.0 | 5 |
| China | 29520 | 8211 | 0.0 | 5 |
| Peter Madsen | 2076849 | 10 | 1.0 | 5 |
| ... | ... | ... | ... | ... |
| Koninginnedag | 2598945 | 10 | 1.0 | 1 |
| Koninginnedag | 333016 | 212 | 0.0 | 1 |
| Kodiak | 79985 | 21 | 1.0 | 1 |
| Kos | 568212 | 33 | 1.0 | 1 |
| Kirkoek | 193268 | 11 | 1.0 | 1 |
422 rows × 4 columns
[ ]: