Unobserved mentions

[4]:
import pathlib

# Wikipedia dump to evaluate per language code, as (wiki name, dump version).
# Kept as a lookup table so switching language is a one-value change instead
# of commenting lines in and out.
WIKI_DUMPS = {
    'en': ('simplewiki', '20211120'),
    'ta': ('tawiki', '20220301'),
    'fa': ('fawiki', '20220301'),
    'tr': ('trwiki', '20220301'),
    'nl': ('nlwiki', '20220301'),
    'ar': ('arwiki', '20220301'),
    'sr': ('srwiki', '20220301'),
    'es': ('eswiki', '20220301'),
    'ja': ('jawiki', '20220301'),
    'de': ('dewiki', '20220301'),
}

# Language under evaluation; must be a key of WIKI_DUMPS.
langcode = 'nl'
wiki, version = WIKI_DUMPS[langcode]

# Mewsli-9 evaluation file for the selected language.
fgold = pathlib.Path(f"evaluation/Mewsli-9/{langcode}.tsv")
[14]:
import pandas as pd
import json, glob, dawg
from minimel.normalize import normalize

# Load the count table for the selected dump: {surface: {entity id: count}}.
# The file is read inside a `with` block so the handle is closed after parsing,
# and every "Q" is removed from the entity ids while the table is built.
with open(f"wiki/{wiki}-{version}/count.min2.json", encoding="utf-8") as counts_file:
    counts = {
        surface: {ent.replace('Q', ''): n for ent, n in ents.items()}
        for surface, ents in json.load(counts_file).items()
    }

# Identifiers listed in the first column of the disambiguation file.
disambig = set(pd.read_csv("data/wikidata-20211122-disambig.txt", header=None)[0])

# Locate the index file for this wiki. Matches are sorted so the choice is
# deterministic when several dumps are present, and a missing index fails with
# an explicit message instead of a bare IndexError.
index_candidates = sorted(glob.glob(f"wiki/{wiki}-*/index_{wiki}-*.dawg"))
if not index_candidates:
    raise FileNotFoundError(
        f"no index file matching wiki/{wiki}-*/index_{wiki}-*.dawg"
    )
index = index_candidates[0]
wm = dawg.IntDAWG().load(index)

def rank(wm, counts, surface, i, stem=None):
    """Return how often `surface` was counted for entity `i`, and the rank of `i`.

    Parameters
    ----------
    wm : object with a ``.get(key)`` method
        Index looked up with a title-style key built from `surface`
        (presumably page title -> entity id; confirm against the index builder).
    counts : dict
        Maps a lower-cased normalized surface to ``{entity id string: count}``.
    surface : str
        Mention text to look up.
    i : entity id
        Matched against `counts` keys through its string form and against the
        value returned by `wm` directly.
    stem : optional
        Forwarded to ``normalize`` as its ``language`` argument.

    Returns
    -------
    tuple
        ``(count, rank)`` where

        * ``(n, -1)``   -- `i` is the only candidate counted for the surface;
        * ``(n, r)``    -- `i` is at zero-based position ``r`` when candidates
          are ordered by descending count;
        * ``(1, -1)``   -- not found in `counts`, but `wm` maps the title-style
          key to `i`;
        * ``(0, None)`` -- not found at all (also returned for an empty surface).
    """
    # An empty mention cannot match anything; returning early also avoids the
    # IndexError that `surface[0]` would raise further down.
    if not surface:
        return 0, None

    key = "%s" % i
    for norm in normalize(surface, language=stem):
        ent_count = counts.get(norm.lower(), {})
        n = ent_count.get(key)
        if not n:
            # Entity absent (or counted zero times) under this normalization.
            continue
        if len(ent_count) == 1:
            return n, -1
        order = sorted(ent_count, key=lambda x: -ent_count[x])
        return n, order.index(key)

    # Fall back to the title index: upper-case the first character and replace
    # spaces in the remainder with underscores.
    title = surface[0].upper() + surface[1:].replace(" ", "_")
    if wm.get(title) == i:
        return 1, -1
    return 0, None


# Build one row per gold mention: (surface, ref, count, rank).
# Each line of the gold file has three tab-separated fields; the second is a
# JSON object mapping surface -> reference id. References listed in `disambig`
# are skipped. The file is read inside a `with` block so the handle is closed,
# and decoded explicitly as UTF-8 rather than with the platform default.
with fgold.open(encoding="utf-8") as gold_file:
    ranks = pd.DataFrame(
        [
            (surface, ref, *rank(wm, counts, surface, ref))
            for _, links, text in (line.split("\t") for line in gold_file)
            for surface, ref in json.loads(links).items()
            if ref not in disambig
        ],
        columns=["surface", "ref", "count", "rank"],
    )
ranks
[14]:
surface ref count rank
0 Tangerang 10127 124 -1.0
1 kortsluiting 206907 115 0.0
2 Bunq 24084541 13 -1.0
3 Mosoel 83317 250 -1.0
4 Abadi 5639013 0 NaN
... ... ... ... ...
11341 VN-Veiligheidsraad 37470 1721 -1.0
11342 Verenigde Naties 1065 4826 0.0
11343 Paramacaanse 2105235 0 NaN
11344 Langatabbetje 2782195 14 -1.0
11345 Ceuta 5823 257 0.0

11346 rows × 4 columns

[16]:
# Tally how often each distinct (surface, ref, count, rank) row occurs in
# `ranks`. Missing values are replaced by the string "na" before counting
# (presumably so rows without a rank are kept in the tally).
test_counts = (
    ranks
    .fillna("na")
    .value_counts()
    .rename("test_count")
    .reset_index()
)
# First 20 tallied mentions that have a zero count and no rank.
test_counts.query('(count == 0) & (rank == "na")').head(20)
[16]:
surface ref count rank test_count
327 Nations League 29000951 0 na 4
365 World Solar Challenge 14520244 0 na 4
429 Abadi 5639013 0 na 3
457 Le Pen 12927 0 na 3
635 Trudeau 3099714 0 na 3
703 Wereldkampioenschap voetbal in Mexico 46938 0 na 2
755 VK.com 116933 0 na 2
772 haat in Brussel 23365300 0 na 2
886 Mattarella 3956186 0 na 2
1010 brug 2222042 0 na 2
1034 Litvinenko 87015 0 na 2
1058 Team BMC 787401 0 na 2
1076 Tweede Kamerverkiezingen van 15 maart 16061881 0 na 2
1082 getaserd 870114 0 na 2
1149 TUI fly 664509 0 na 2
1151 MOAS 18209263 0 na 2
1194 tsunami 60042839 0 na 2
1208 vale gieren 177856 0 na 2
1217 Amerikaanse presidentschap 45578 0 na 2
1281 NPB 2113973 0 na 2

Hardest entities

[17]:
# Surfaces that have at least one tallied row whose rank equals 1.
hard_ents = test_counts.query("rank==1")["surface"]
# Display every tallied row for those surfaces, in the order they were selected.
(
    test_counts
    .set_index("surface")
    .loc[hard_ents.tolist()]
)
[17]:
ref count rank test_count
surface
Openbaar Ministerie 2595790 553 0.0 19
Openbaar Ministerie 11775750 312 1.0 6
China 148 6406 1.0 5
China 29520 8211 0.0 5
Peter Madsen 2076849 10 1.0 5
... ... ... ... ...
Koninginnedag 2598945 10 1.0 1
Koninginnedag 333016 212 0.0 1
Kodiak 79985 21 1.0 1
Kos 568212 33 1.0 1
Kirkoek 193268 11 1.0 1

422 rows × 4 columns

[ ]: