{ "cells": [ { "cell_type": "markdown", "id": "bbf5fc11-cd5b-41f8-90ab-f4a3162e264f", "metadata": {}, "source": [ "# Unobserved mentions" ] }, { "cell_type": "code", "execution_count": 4, "id": "ac476b09-2a68-4031-bf94-801605aa961f", "metadata": { "tags": [] }, "outputs": [], "source": [ "import pathlib\n", "\n", "# wiki, version, langcode = 'simplewiki', '20211120', 'en'\n", "# wiki, version, langcode = 'tawiki', '20220301', 'ta'\n", "# wiki, version, langcode = 'fawiki', '20220301', 'fa'\n", "# wiki, version, langcode = 'trwiki', '20220301', 'tr'\n", "wiki, version, langcode = 'nlwiki', '20220301', 'nl'\n", "# wiki, version, langcode = 'arwiki', '20220301', 'ar'\n", "# wiki, version, langcode = 'srwiki', '20220301', 'sr'\n", "# wiki, version, langcode = 'eswiki', '20220301', 'es'\n", "# wiki, version, langcode = 'jawiki', '20220301', 'ja'\n", "# wiki, version, langcode = 'dewiki', '20220301', 'de'\n", "\n", "fgold = pathlib.Path(f\"evaluation/Mewsli-9/{langcode}.tsv\")" ] }, { "cell_type": "code", "execution_count": 14, "id": "3a8e22a0-9da3-4bbf-83f0-2995c696532f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | surface | \n", "ref | \n", "count | \n", "rank | \n", "
|---|---|---|---|---|
| 0 | \n", "Tangerang | \n", "10127 | \n", "124 | \n", "-1.0 | \n", "
| 1 | \n", "kortsluiting | \n", "206907 | \n", "115 | \n", "0.0 | \n", "
| 2 | \n", "Bunq | \n", "24084541 | \n", "13 | \n", "-1.0 | \n", "
| 3 | \n", "Mosoel | \n", "83317 | \n", "250 | \n", "-1.0 | \n", "
| 4 | \n", "Abadi | \n", "5639013 | \n", "0 | \n", "NaN | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 11341 | \n", "VN-Veiligheidsraad | \n", "37470 | \n", "1721 | \n", "-1.0 | \n", "
| 11342 | \n", "Verenigde Naties | \n", "1065 | \n", "4826 | \n", "0.0 | \n", "
| 11343 | \n", "Paramacaanse | \n", "2105235 | \n", "0 | \n", "NaN | \n", "
| 11344 | \n", "Langatabbetje | \n", "2782195 | \n", "14 | \n", "-1.0 | \n", "
| 11345 | \n", "Ceuta | \n", "5823 | \n", "257 | \n", "0.0 | \n", "
11346 rows × 4 columns
\n", "| \n", " | surface | \n", "ref | \n", "count | \n", "rank | \n", "test_count | \n", "
|---|---|---|---|---|---|
| 327 | \n", "Nations League | \n", "29000951 | \n", "0 | \n", "na | \n", "4 | \n", "
| 365 | \n", "World Solar Challenge | \n", "14520244 | \n", "0 | \n", "na | \n", "4 | \n", "
| 429 | \n", "Abadi | \n", "5639013 | \n", "0 | \n", "na | \n", "3 | \n", "
| 457 | \n", "Le Pen | \n", "12927 | \n", "0 | \n", "na | \n", "3 | \n", "
| 635 | \n", "Trudeau | \n", "3099714 | \n", "0 | \n", "na | \n", "3 | \n", "
| 703 | \n", "Wereldkampioenschap voetbal in Mexico | \n", "46938 | \n", "0 | \n", "na | \n", "2 | \n", "
| 755 | \n", "VK.com | \n", "116933 | \n", "0 | \n", "na | \n", "2 | \n", "
| 772 | \n", "haat in Brussel | \n", "23365300 | \n", "0 | \n", "na | \n", "2 | \n", "
| 886 | \n", "Mattarella | \n", "3956186 | \n", "0 | \n", "na | \n", "2 | \n", "
| 1010 | \n", "brug | \n", "2222042 | \n", "0 | \n", "na | \n", "2 | \n", "
| 1034 | \n", "Litvinenko | \n", "87015 | \n", "0 | \n", "na | \n", "2 | \n", "
| 1058 | \n", "Team BMC | \n", "787401 | \n", "0 | \n", "na | \n", "2 | \n", "
| 1076 | \n", "Tweede Kamerverkiezingen van 15 maart | \n", "16061881 | \n", "0 | \n", "na | \n", "2 | \n", "
| 1082 | \n", "getaserd | \n", "870114 | \n", "0 | \n", "na | \n", "2 | \n", "
| 1149 | \n", "TUI fly | \n", "664509 | \n", "0 | \n", "na | \n", "2 | \n", "
| 1151 | \n", "MOAS | \n", "18209263 | \n", "0 | \n", "na | \n", "2 | \n", "
| 1194 | \n", "tsunami | \n", "60042839 | \n", "0 | \n", "na | \n", "2 | \n", "
| 1208 | \n", "vale gieren | \n", "177856 | \n", "0 | \n", "na | \n", "2 | \n", "
| 1217 | \n", "Amerikaanse presidentschap | \n", "45578 | \n", "0 | \n", "na | \n", "2 | \n", "
| 1281 | \n", "NPB | \n", "2113973 | \n", "0 | \n", "na | \n", "2 | \n", "
| \n", " | ref | \n", "count | \n", "rank | \n", "test_count | \n", "
|---|---|---|---|---|
| surface | \n", "\n", " | \n", " | \n", " | \n", " |
| Openbaar Ministerie | \n", "2595790 | \n", "553 | \n", "0.0 | \n", "19 | \n", "
| Openbaar Ministerie | \n", "11775750 | \n", "312 | \n", "1.0 | \n", "6 | \n", "
| China | \n", "148 | \n", "6406 | \n", "1.0 | \n", "5 | \n", "
| China | \n", "29520 | \n", "8211 | \n", "0.0 | \n", "5 | \n", "
| Peter Madsen | \n", "2076849 | \n", "10 | \n", "1.0 | \n", "5 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| Koninginnedag | \n", "2598945 | \n", "10 | \n", "1.0 | \n", "1 | \n", "
| Koninginnedag | \n", "333016 | \n", "212 | \n", "0.0 | \n", "1 | \n", "
| Kodiak | \n", "79985 | \n", "21 | \n", "1.0 | \n", "1 | \n", "
| Kos | \n", "568212 | \n", "33 | \n", "1.0 | \n", "1 | \n", "
| Kirkoek | \n", "193268 | \n", "11 | \n", "1.0 | \n", "1 | \n", "
422 rows × 4 columns
\n", "