import pathlib

# Known Wikipedia dumps: wiki name -> (dump version, language code).
# The language code is what names the gold-standard file under
# evaluation/Mewsli-9/ (see `fgold` below).
WIKI_DUMPS = {
    "simplewiki": ("20211120", "en"),
    "tawiki": ("20220301", "ta"),
    "fawiki": ("20220301", "fa"),
    "trwiki": ("20220301", "tr"),
    "nlwiki": ("20220301", "nl"),
    "arwiki": ("20220301", "ar"),
    "srwiki": ("20220301", "sr"),
    "eswiki": ("20220301", "es"),
    "jawiki": ("20220301", "ja"),
    "dewiki": ("20220301", "de"),
}

# Change only this name to analyse another dump; version and language code
# are looked up together so they cannot get out of sync.
wiki = "nlwiki"
version, langcode = WIKI_DUMPS[wiki]

# Gold-standard mentions file for the selected language.
fgold = pathlib.Path(f"evaluation/Mewsli-9/{langcode}.tsv")
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
surfacerefcountrank
0Tangerang10127124-1.0
1kortsluiting2069071150.0
2Bunq2408454113-1.0
3Mosoel83317250-1.0
4Abadi56390130NaN
...............
11341VN-Veiligheidsraad374701721-1.0
11342Verenigde Naties106548260.0
11343Paramacaanse21052350NaN
11344Langatabbetje278219514-1.0
11345Ceuta58232570.0
\n", "

11346 rows × 4 columns

\n", "
import glob
import json

import dawg
import pandas as pd
from minimel.normalize import normalize

# Load the per-surface entity counts; the context manager closes the file
# instead of leaving the handle to the garbage collector.
with open(f"wiki/{wiki}-{version}/count.min2.json") as counts_file:
    raw_counts = json.load(counts_file)

# Values of the first column of the disambiguation list
# (presumably Wikidata ids of disambiguation pages -- confirm against the file).
disambig = set(pd.read_csv("data/wikidata-20211122-disambig.txt", header=None)[0])

# The pattern is not pinned to `version`, so several index files may match.
# glob returns matches in arbitrary order: sort to make the choice repeatable
# and fail with a readable message when nothing matches.
index_pattern = f"wiki/{wiki}-*/index_{wiki}-*.dawg"
index_files = sorted(glob.glob(index_pattern))
if not index_files:
    raise FileNotFoundError(f"no title index matches {index_pattern!r}")
index = index_files[0]
wm = dawg.IntDAWG().load(index)

# Strip the "Q" from entity-id keys so they can be matched against str(ref).
counts = {
    surface_form: {ent.replace("Q", ""): n for ent, n in ent_counts.items()}
    for surface_form, ent_counts in raw_counts.items()
}


def rank(wm, counts, surface, i, stem=None):
    """Return ``(count, rank)`` of entity ``i`` for the mention ``surface``.

    Args:
        wm: title index offering ``.get(title)``; its result is compared to ``i``.
        counts: mapping ``lower-cased normalised surface -> {entity id (str): count}``.
        surface: mention text to look up.
        i: entity id; matched as ``"%s" % i`` against the keys in ``counts``
            and unchanged against ``wm.get(...)``.
        stem: forwarded to ``normalize`` as its ``language`` argument.

    Returns:
        A ``(count, rank)`` tuple for the first normalised form of ``surface``
        under which ``i`` has a non-zero count:

        * ``rank >= 0`` -- position of ``i`` among that form's candidates
          ordered by descending count (0 is the most frequent);
        * ``rank == -1`` -- ``i`` is the only candidate for that form.

        If no form matches but the title built from ``surface`` maps to ``i``
        in ``wm``, returns ``(1, -1)``. Otherwise (including an empty
        ``surface``) returns ``(0, None)``.
    """
    key = "%s" % i
    for norm in normalize(surface, language=stem):
        ent_count = counts.get(norm.lower(), {})
        n = ent_count.get(key)
        if not n:
            # Entity absent (or zero count) under this form: try the next one.
            continue
        if len(ent_count) == 1:
            return n, -1
        order = sorted(ent_count, key=lambda x: -ent_count[x])
        return n, order.index(key)
    if surface:
        # Build the title key: upper-case the first character and replace
        # spaces in the remainder with underscores. Guarded because
        # surface[0] would raise IndexError on an empty string.
        title = surface[0].upper() + surface[1:].replace(" ", "_")
        if wm.get(title) == i:
            return 1, -1
    return 0, None


# One row per (surface, ref) link in the gold file, skipping refs listed in
# `disambig`. Each gold line must hold exactly three tab-separated fields,
# the second being a JSON object mapping surface -> ref.
with fgold.open() as gold_file:
    ranks = pd.DataFrame(
        [
            (surface, ref, *rank(wm, counts, surface, ref))
            for _, links, _text in (line.split("\t") for line in gold_file)
            for surface, ref in json.loads(links).items()
            if ref not in disambig
        ],
        columns=["surface", "ref", "count", "rank"],
    )
ranks
"cell_type": "code", "execution_count": 16, "id": "d4787345-56cc-4c7a-b1f1-02824612da5b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
surfacerefcountranktest_count
327Nations League290009510na4
365World Solar Challenge145202440na4
429Abadi56390130na3
457Le Pen129270na3
635Trudeau30997140na3
703Wereldkampioenschap voetbal in Mexico469380na2
755VK.com1169330na2
772haat in Brussel233653000na2
886Mattarella39561860na2
1010brug22220420na2
1034Litvinenko870150na2
1058Team BMC7874010na2
1076Tweede Kamerverkiezingen van 15 maart160618810na2
1082getaserd8701140na2
1149TUI fly6645090na2
1151MOAS182092630na2
1194tsunami600428390na2
1208vale gieren1778560na2
1217Amerikaanse presidentschap455780na2
1281NPB21139730na2
\n", "
# Count how often each distinct (surface, ref, count, rank) row occurs in the
# gold data. Missing ranks are replaced by the sentinel "na" first, because
# value_counts() leaves out rows containing NaN by default.
test_counts = (
    ranks.fillna("na")
    .value_counts()
    .rename("test_count")
    .reset_index()
)

# Mentions that were never found: zero count and no rank at all.
is_unobserved = (test_counts["count"] == 0) & (test_counts["rank"] == "na")
test_counts[is_unobserved].head(20)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
refcountranktest_count
surface
Openbaar Ministerie25957905530.019
Openbaar Ministerie117757503121.06
China14864061.05
China2952082110.05
Peter Madsen2076849101.05
...............
Koninginnedag2598945101.01
Koninginnedag3330162120.01
Kodiak79985211.01
Kos568212331.01
Kirkoek193268111.01
\n", "

422 rows × 4 columns

\n", "
# Hardest entities: surfaces for which some gold entity has rank 1, i.e. it is
# only the second candidate when candidates are ordered by descending count.
is_second_choice = test_counts["rank"] == 1
hard_ents = test_counts.loc[is_second_choice, "surface"]

# Show every row sharing one of those surfaces, so the competing entities
# appear next to each other.
test_counts.set_index("surface").loc[hard_ents.tolist()]