{ "cells": [ { "cell_type": "markdown", "id": "41ce243f-03a5-4cda-9e54-b099efac1acc", "metadata": {}, "source": [ "# Inspect Model Features" ] }, { "cell_type": "code", "execution_count": 2, "id": "1b2b99f2-bfb3-4b28-8fd6-62c47e3a4efd", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Obtaining file:///home/jupyter-benno/minimEL\n", " Preparing metadata (setup.py) ... \u001b[?25ldone\n", "\u001b[?25hInstalling collected packages: minimel\n", " Attempting uninstall: minimel\n", " Found existing installation: minimel 0.1\n", " Uninstalling minimel-0.1:\n", " Successfully uninstalled minimel-0.1\n", " Running setup.py develop for minimel\n", "Successfully installed minimel-0.1\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "creating quadratic features for pairs: ls\n", "only testing\n", "using no cache\n", "Reading datafile = none\n", "num sources = 0\n", "Num weight bits = 24\n", "learning rate = 0.5\n", "initial_t = 0\n", "power_t = 0.5\n", "Enabled learners: gd, scorer-identity, csoaa_ldf-prob, shared_feature_merger\n", "Input label = CS\n", "Output pred = SCALARS\n", "average since example example current current current\n", "loss last counter weight label predict features\n", "0.000000 0.000000 1 1.0 known 221653 208\n", "0.000000 0.000000 2 2.0 unknown 0 3172\n", "0.000000 0.000000 4 4.0 unknown 0 1664\n", "0.000000 0.000000 8 8.0 unknown 0 2418\n", "0.000000 0.000000 16 16.0 unknown 0 1794\n", "0.000000 0.000000 32 32.0 unknown 0 1898\n", "0.000000 0.000000 64 64.0 known 776 910\n", "0.007812 0.015625 128 128.0 unknown 0 8294\n", "0.003906 0.000000 256 256.0 unknown 0 10504\n", "0.003906 0.003906 512 512.0 unknown 0 1248\n", "{803: 588, 776: 229, 707767: 94, 261716: 38, 575655: 21, 221653: 14, 2679365: 11, 18108: 4, 24680: 2}\n", "\n", "finished run\n", "number of examples = 1001\n", "weighted example sum = 1001.000000\n", "weighted label sum = 0.000000\n", "average loss = 0.008991\n", "average multiclass log loss = 0.449134\n", "total feature number = 2910232\n" ] }, { "data": { "text/plain": [ "253045" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "!pip install --user -e .\n", "# surface = 'madrid'\n", "surface = \"utrecht\"\n", "# wiki = 'simplewiki-20211120'\n", "wiki = \"nlwiki-20220301\"\n", "# wiki = 'eswiki-20220301'\n", "modelfile = f\"../data/wiki/{wiki}/experiments/clean-q0.25.24b.vw\"\n", "datafile = f\"../data/wiki/{wiki}/experiments/clean-q0.25.dat\"\n", "\n", "import subprocess\n", "\n", "args = [\"python\", \"-m\", \"minimel\", \"audit\", modelfile, datafile, surface]\n", "feats = set()\n", "with subprocess.Popen(args, stdout=subprocess.PIPE, stderr=None) as process:\n", " for line in process.stdout:\n", " line = line.decode(\"utf8\").rstrip()\n", " if line.startswith(\"\\t\"):\n", " feats.update(set(f for f in line[1:].split(\"\\t\") if f.startswith(\"l^\")))\n", "len(feats)" ] }, { "cell_type": "code", "execution_count": 3, "id": "3c6ef0b1-a3d1-4abd-af9e-4d4887088d1c", "metadata": { "tags": [] }, "outputs": [], "source": [ "import io\n", "import pandas as pd\n", "\n", "df = pd.read_csv(\n", " io.StringIO(\"\\n\".join(feats)),\n", " sep=\"\\*|:|\\^|=\",\n", " header=None,\n", " engine=\"python\",\n", " usecols=[2, 4, 7],\n", " names=[\"wid\", \"feat\", \"weight\"],\n", ").dropna()\n", "\n", "# Normalize weights\n", "df[\"weight\"] = -(df[\"weight\"] - df.groupby(\"feat\")[\"weight\"].transform(\"mean\"))" ] }, { "cell_type": "code", "execution_count": 4, "id": "a768a0df-5a61-4478-b0d9-045ece31f532", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{776: 'Utrecht_(provincie)', 803: 'Utrecht_(stad)', 18108: 'Utrecht_(Zuid-Afrika)', 24680: 'FC_Utrecht', 221653: 'Universiteit_Utrecht', 261716: 'Aartsbisdom_Utrecht_(rooms-katholiek)', 575655: 'Station_Utrecht_Centraal', 707767: 'Sticht_Utrecht', 847384: 'Utrechts_Conservatorium', 2012748: 'Vechtsebanen', 2193594: 'Hr.Ms._Utrecht_(1901)', 2679365: 'Heerlijkheid_Utrecht', 85308316: 'BVC_Utrecht'}\n" ] } ], "source": [ "select_ents = set(df[\"wid\"].unique())\n", "\n", "import sqlite3\n", "\n", "con = sqlite3.connect(f\"../data/wiki/{wiki}/index_{wiki}.db\")\n", "ent_label = {}\n", "for e in select_ents:\n", " l = pd.read_sql_query(\n", " f'select * from mapping where wikidata_id=\"Q{e}\" limit 1', con\n", " )\n", " ent_label[e] = l[\"wikipedia_title\"][0]\n", "\n", "print(dict(sorted(ent_label.items())))" ] }, { "cell_type": "code", "execution_count": 5, "id": "24882acc-fd67-4c19-901c-401c7c7e4af5", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "\n", "
| \n", " | \n", " | 0 | \n", "1 | \n", "2 | \n", "3 | \n", "4 | \n", "5 | \n", "6 | \n", "7 | \n", "8 | \n", "9 | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|
| Utrecht_(provincie) | \n", "feat | \n", "provincie | \n", "wegverkeer | \n", "stuurde | \n", "baarn | \n", "vormentaal | \n", "samenspraak | \n", "filosofie | \n", "vakantieoord | \n", "knutselen | \n", "cartoonist | \n", "
| weight | \n", "1.98 | \n", "-1.26 | \n", "-1.07 | \n", "0.87 | \n", "0.86 | \n", "-0.86 | \n", "0.84 | \n", "0.83 | \n", "0.80 | \n", "-0.80 | \n", "|
| Utrecht_(stad) | \n", "feat | \n", "blijkt | \n", "utrecht | \n", "provincie | \n", "belangrijkste | \n", "niedersächsisch | \n", "anc | \n", "stad | \n", "museum | \n", "verplegings | \n", "ondeelbaar | \n", "
| weight | \n", "-1.74 | \n", "1.38 | \n", "-1.18 | \n", "-1.10 | \n", "-1.06 | \n", "-1.04 | \n", "0.88 | \n", "0.87 | \n", "-0.87 | \n", "-0.84 | \n", "|
| Utrecht_(Zuid-Afrika) | \n", "feat | \n", "piraten | \n", "pionier | \n", "balmat | \n", "fusieplan | \n", "polemieken | \n", "ontario | \n", "uitreiking | \n", "university | \n", "anna | \n", "zalige | \n", "
| weight | \n", "-1.19 | \n", "-1.11 | \n", "-0.89 | \n", "-0.83 | \n", "-0.80 | \n", "0.79 | \n", "-0.76 | \n", "0.76 | \n", "0.75 | \n", "-0.75 | \n", "|
| FC_Utrecht | \n", "feat | \n", "universiteit | \n", "perks | \n", "brandlaag | \n", "charter | \n", "geslagen | \n", "draait | \n", "krugersdorp | \n", "temperature | \n", "kessels | \n", "behandelen | \n", "
| weight | \n", "1.03 | \n", "-1.01 | \n", "-0.84 | \n", "0.81 | \n", "-0.74 | \n", "0.71 | \n", "-0.70 | \n", "0.66 | \n", "-0.66 | \n", "-0.65 | \n", "|
| Universiteit_Utrecht | \n", "feat | \n", "orde | \n", "kaunda | \n", "romanpersonage | \n", "opgelegd | \n", "work | \n", "vilain | \n", "pestepidemie | \n", "wet | \n", "voorafgegaan | \n", "yntema | \n", "
| weight | \n", "-1.30 | \n", "0.92 | \n", "-0.87 | \n", "-0.84 | \n", "-0.82 | \n", "-0.82 | \n", "0.80 | \n", "-0.78 | \n", "-0.72 | \n", "0.70 | \n", "