{ "cells": [ { "cell_type": "markdown", "id": "41ce243f-03a5-4cda-9e54-b099efac1acc", "metadata": {}, "source": [ "# Inspect Model Features" ] }, { "cell_type": "code", "execution_count": 2, "id": "1b2b99f2-bfb3-4b28-8fd6-62c47e3a4efd", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Obtaining file:///home/jupyter-benno/minimEL\n", " Preparing metadata (setup.py) ... \u001b[?25ldone\n", "\u001b[?25hInstalling collected packages: minimel\n", " Attempting uninstall: minimel\n", " Found existing installation: minimel 0.1\n", " Uninstalling minimel-0.1:\n", " Successfully uninstalled minimel-0.1\n", " Running setup.py develop for minimel\n", "Successfully installed minimel-0.1\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "creating quadratic features for pairs: ls\n", "only testing\n", "using no cache\n", "Reading datafile = none\n", "num sources = 0\n", "Num weight bits = 24\n", "learning rate = 0.5\n", "initial_t = 0\n", "power_t = 0.5\n", "Enabled learners: gd, scorer-identity, csoaa_ldf-prob, shared_feature_merger\n", "Input label = CS\n", "Output pred = SCALARS\n", "average since example example current current current\n", "loss last counter weight label predict features\n", "0.000000 0.000000 1 1.0 known 221653 208\n", "0.000000 0.000000 2 2.0 unknown 0 3172\n", "0.000000 0.000000 4 4.0 unknown 0 1664\n", "0.000000 0.000000 8 8.0 unknown 0 2418\n", "0.000000 0.000000 16 16.0 unknown 0 1794\n", "0.000000 0.000000 32 32.0 unknown 0 1898\n", "0.000000 0.000000 64 64.0 known 776 910\n", "0.007812 0.015625 128 128.0 unknown 0 8294\n", "0.003906 0.000000 256 256.0 unknown 0 10504\n", "0.003906 0.003906 512 512.0 unknown 0 1248\n", "{803: 588, 776: 229, 707767: 94, 261716: 38, 575655: 21, 221653: 14, 2679365: 11, 18108: 4, 24680: 2}\n", "\n", "finished run\n", "number of examples = 1001\n", "weighted example sum = 1001.000000\n", "weighted label sum = 0.000000\n", "average loss = 0.008991\n", "average multiclass log loss = 0.449134\n", "total feature number = 2910232\n" ] }, { "data": { "text/plain": [ "253045" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "!pip install --user -e .\n", "# surface = 'madrid'\n", "surface = \"utrecht\"\n", "# wiki = 'simplewiki-20211120'\n", "wiki = \"nlwiki-20220301\"\n", "# wiki = 'eswiki-20220301'\n", "modelfile = f\"../data/wiki/{wiki}/experiments/clean-q0.25.24b.vw\"\n", "datafile = f\"../data/wiki/{wiki}/experiments/clean-q0.25.dat\"\n", "\n", "import subprocess\n", "\n", "args = [\"python\", \"-m\", \"minimel\", \"audit\", modelfile, datafile, surface]\n", "feats = set()\n", "with subprocess.Popen(args, stdout=subprocess.PIPE, stderr=None) as process:\n", " for line in process.stdout:\n", " line = line.decode(\"utf8\").rstrip()\n", " if line.startswith(\"\\t\"):\n", " feats.update(set(f for f in line[1:].split(\"\\t\") if f.startswith(\"l^\")))\n", "len(feats)" ] }, { "cell_type": "code", "execution_count": 3, "id": "3c6ef0b1-a3d1-4abd-af9e-4d4887088d1c", "metadata": { "tags": [] }, "outputs": [], "source": [ "import io\n", "import pandas as pd\n", "\n", "df = pd.read_csv(\n", " io.StringIO(\"\\n\".join(feats)),\n", " sep=\"\\*|:|\\^|=\",\n", " header=None,\n", " engine=\"python\",\n", " usecols=[2, 4, 7],\n", " names=[\"wid\", \"feat\", \"weight\"],\n", ").dropna()\n", "\n", "# Normalize weights\n", "df[\"weight\"] = -(df[\"weight\"] - df.groupby(\"feat\")[\"weight\"].transform(\"mean\"))" ] }, { "cell_type": "code", "execution_count": 4, "id": "a768a0df-5a61-4478-b0d9-045ece31f532", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{776: 'Utrecht_(provincie)', 803: 'Utrecht_(stad)', 18108: 'Utrecht_(Zuid-Afrika)', 24680: 'FC_Utrecht', 221653: 'Universiteit_Utrecht', 261716: 'Aartsbisdom_Utrecht_(rooms-katholiek)', 575655: 'Station_Utrecht_Centraal', 707767: 'Sticht_Utrecht', 847384: 'Utrechts_Conservatorium', 2012748: 'Vechtsebanen', 2193594: 'Hr.Ms._Utrecht_(1901)', 2679365: 'Heerlijkheid_Utrecht', 85308316: 'BVC_Utrecht'}\n" ] } ], "source": [ "select_ents = set(df[\"wid\"].unique())\n", "\n", "import sqlite3\n", "\n", "con = sqlite3.connect(f\"../data/wiki/{wiki}/index_{wiki}.db\")\n", "ent_label = {}\n", "for e in select_ents:\n", " l = pd.read_sql_query(\n", " f'select * from mapping where wikidata_id=\"Q{e}\" limit 1', con\n", " )\n", " ent_label[e] = l[\"wikipedia_title\"][0]\n", "\n", "print(dict(sorted(ent_label.items())))" ] }, { "cell_type": "code", "execution_count": 5, "id": "24882acc-fd67-4c19-901c-401c7c7e4af5", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
  0123456789
Utrecht_(provincie)featprovinciewegverkeerstuurdebaarnvormentaalsamenspraakfilosofievakantieoordknutselencartoonist
weight1.98-1.26-1.070.870.86-0.860.840.830.80-0.80
Utrecht_(stad)featblijktutrechtprovinciebelangrijksteniedersächsischancstadmuseumverplegingsondeelbaar
weight-1.741.38-1.18-1.10-1.06-1.040.880.87-0.87-0.84
Utrecht_(Zuid-Afrika)featpiratenpionierbalmatfusieplanpolemiekenontariouitreikinguniversityannazalige
weight-1.19-1.11-0.89-0.83-0.800.79-0.760.760.75-0.75
FC_Utrechtfeatuniversiteitperksbrandlaagchartergeslagendraaitkrugersdorptemperaturekesselsbehandelen
weight1.03-1.01-0.840.81-0.740.71-0.700.66-0.66-0.65
Universiteit_Utrechtfeatordekaundaromanpersonageopgelegdworkvilainpestepidemiewetvoorafgegaanyntema
weight-1.300.92-0.87-0.84-0.82-0.820.80-0.78-0.720.70
\n" ], "text/plain": [ "" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def topfeat(gr):\n", " gr = gr.drop(columns=\"wid\").set_index(\"feat\").dropna()\n", " gr = gr.loc[gr[\"weight\"].apply(\"abs\").sort_values().index[::-1]]\n", " return gr.head(10).reset_index()\n", "\n", "\n", "tops = df.groupby(\"wid\").apply(topfeat)\n", "tops = tops.swaplevel().unstack().swaplevel(axis=1).sort_index(axis=1).T\n", "tops.index = tops.index.set_levels(\n", " [ent_label[wid] for wid in tops.index.levels[0]], level=0\n", ")\n", "tops.index.names = (None, None)\n", "\n", "import seaborn as sns\n", "\n", "cmap = sns.diverging_palette(0, 230, 90, 60, as_cmap=True)\n", "tops.head(10).style.background_gradient(\n", " cmap=cmap, subset=pd.IndexSlice[pd.IndexSlice[:, \"weight\"], :]\n", ").format(precision=2)" ] }, { "cell_type": "code", "execution_count": null, "id": "c64767cd-92fb-49c6-85a4-c57218677dc7", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" } }, "nbformat": 4, "nbformat_minor": 5 }