{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 1000/1000 [00:11<00:00, 86.14it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "184022 descriptions\n"
     ]
    }
   ],
   "source": [
    "import tqdm\n",
    "import pathlib\n",
    "import re\n",
    "import json\n",
    "\n",
    "try:\n",
    "    import dawg\n",
    "except ImportError:\n",
    "    import dawg_python as dawg\n",
    "\n",
    "# Locations of the nlwiki dump and of the experiment files derived from it.\n",
    "root = pathlib.Path('wiki/nlwiki-20220301')\n",
    "dawgfile = root / 'index_nlwiki-20220301.dawg'\n",
    "candidates_file = root / 'experiments' / 'clean-q1.json'\n",
    "paragraph_dir = root / 'nlwiki-20220301-paragraph-links'\n",
    "descriptions_file = root / 'experiments' / 'clean-q1-descriptions.tsv'\n",
    "\n",
    "# index.get(page) looks up the integer stored for a page key.\n",
    "index = dawg.IntDAWG()\n",
    "index.load(str(dawgfile))\n",
    "\n",
    "# Every id listed in any value of the candidates file, converted to int.\n",
    "with open(candidates_file, encoding='utf-8') as f:\n",
    "    candidates = json.load(f)\n",
    "entities = {int(c) for cs in candidates.values() for c in cs}\n",
    "\n",
    "# Raw strings keep the regex backslashes literal (no invalid-escape warnings).\n",
    "# brackets: a parenthesised span plus one optional trailing space.\n",
    "brackets = re.compile(r'\\([^)]*\\) ?')\n",
    "# phrase: shortest prefix of at least 10 characters that ends in '. '.\n",
    "phrase = re.compile(r'.{10,}?\\. ')\n",
    "\n",
    "descriptions = []\n",
    "files = list(paragraph_dir.glob('*'))\n",
    "for fname in tqdm.tqdm(files):\n",
    "    prev = None\n",
    "    # The with-block closes each paragraph file as soon as it has been read.\n",
    "    with open(fname, encoding='utf-8') as lines:\n",
    "        for line in lines:\n",
    "            fields = line.split('\\t', 2)\n",
    "            if len(fields) != 3:\n",
    "                # Malformed row: skip it instead of aborting the whole run\n",
    "                # (the pandas loader of the same files also skips bad lines).\n",
    "                continue\n",
    "            page, _, text = fields\n",
    "            # Only the first line seen for a page is considered.\n",
    "            if page != prev:\n",
    "                text = brackets.sub('', text)\n",
    "                m = phrase.match(text)\n",
    "                if m:\n",
    "                    e = index.get(page)\n",
    "                    if e in entities:\n",
    "                        descriptions.append((e, m.group(0)))\n",
    "                prev = page\n",
    "\n",
    "print(len(descriptions), 'descriptions')\n",
    "with open(descriptions_file, 'w', encoding='utf-8') as fw:\n",
    "    for e, d in descriptions:\n",
    "        print(e, d, sep='\\t', file=fw)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 50/50 [00:03<00:00, 14.29it/s]\n"
     ]
    }
   ],
   "source": [
    "import duckdb\n",
    "\n",
    "import pandas as pd\n",
    "import json\n",
    "import pathlib\n",
    "import tqdm\n",
    "\n",
    "try:\n",
    "    import dawg\n",
    "except ImportError:\n",
    "    import dawg_python as dawg\n",
    "\n",
    "# Read the DAWG-backed index that is used below to map page names to integers.\n",
    "root = pathlib.Path('wiki/nlwiki-20220301')\n",
    "dawgfile = root.joinpath('index_nlwiki-20220301.dawg')\n",
    "index = dawg.IntDAWG()\n",
    "index.load(f'{dawgfile}')\n",
    "\n",
    "def loads(x):\n",
    "    \"\"\"Parse ``x`` as JSON; return an empty dict when it cannot be parsed.\n",
    "\n",
    "    TypeError covers input that is not str/bytes (e.g. a missing value),\n",
    "    ValueError covers malformed JSON (json.JSONDecodeError is a ValueError\n",
    "    subclass). Any other exception, such as KeyboardInterrupt, propagates.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        return json.loads(x)\n",
    "    except (TypeError, ValueError):\n",
    "        return {}\n",
    "\n",
    "# Directory with the tab-separated paragraph files (columns: page, links, text).\n",
    "paragraph_dir = pathlib.Path('wiki/nlwiki-20220301/nlwiki-20220301-paragraph-links/')\n",
    "# Only the first N_FILES files are loaded. glob() yields files in arbitrary\n",
    "# order, so sort first to get the same subset on every run.\n",
    "N_FILES = 50\n",
    "n = 0  # running row offset, keeps row ids unique across files\n",
    "text_frames, link_frames = [], []\n",
    "for fname in tqdm.tqdm(sorted(paragraph_dir.glob('*'))[:N_FILES]):\n",
    "    df = pd.read_csv(fname, sep='\\t', on_bad_lines='skip', names=['page', 'links', 'text'])\n",
    "    df['page'] = df['page'].map(index.get).astype('Int64')\n",
    "    df.index += n\n",
    "    n += len(df)\n",
    "    # One row per (name, target) pair of the links column; the row id of the\n",
    "    # originating paragraph is kept as the index.\n",
    "    pairs = df.links.map(lambda x: list(loads(x).items())).explode()\n",
    "    link_frames.append(pd.DataFrame({'name': pairs.str[0], 'target': pairs.str[1]}))\n",
    "    text_frames.append(df.drop(columns=['links']))\n",
    "# reset_index() exposes the row id as an 'index' column in both frames.\n",
    "texts = pd.concat(text_frames).reset_index()\n",
    "links = pd.concat(link_frames).reset_index()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>target</th>\n",
       "      <th>c</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>803</td>\n",
       "      <td>900</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>776</td>\n",
       "      <td>174</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>707767</td>\n",
       "      <td>24</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>261716</td>\n",
       "      <td>16</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2677914</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>221653</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>575655</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>2679365</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>347488</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>85308316</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     target    c\n",
       "0       803  900\n",
       "1       776  174\n",
       "2    707767   24\n",
       "3    261716   16\n",
       "4   2677914    5\n",
       "5    221653    5\n",
       "6    575655    4\n",
       "7   2679365    4\n",
       "8    347488    3\n",
       "9  85308316    2"
      ]
     },
     "execution_count": 118,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Ten most frequent link targets for the name 'Utrecht', with their counts.\n",
    "utrecht_targets_sql = \"\"\"\n",
    "select target, count(*) c from texts, links \n",
    "where texts.index=links.index and name == 'Utrecht'\n",
    "group by target\n",
    "order by c desc\n",
    "limit 10\n",
    "\"\"\"\n",
    "duckdb.query(utrecht_targets_sql).df()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "803 Utrecht is een stad en gemeente in Nederland en de hoofdstad van de provincie Utrecht. \n",
      "werk                16.226999\n",
      "den haag            16.168381\n",
      "utrecht stad        16.106695\n",
      "haag                16.104145\n",
      "rotterdam           16.102599\n",
      "universiteit        16.101863\n",
      "hoogleraar          16.091765\n",
      "stad utrecht        16.040620\n",
      "museum              16.017313\n",
      "nederlandse stad    15.991953\n",
      "dtype: float64\n",
      "\n",
      "776 Utrecht is met een landoppervlakte van 1.485 km² de op een na kleinste provincie van Nederland. \n",
      "provincie utrecht                15.735772\n",
      "nederlandse provincie            15.461026\n",
      "nederlandse provincie utrecht    15.311498\n",
      "eemnes                           15.273796\n",
      "provincie                        15.219166\n",
      "eemnes provincie                 15.066386\n",
      "eemnes provincie utrecht         15.066386\n",
      "zuid holland                     14.828159\n",
      "noord holland                    14.789730\n",
      "holland utrecht                  14.663517\n",
      "dtype: float64\n",
      "\n",
      "707767 Het Sticht Utrecht was het territorium waarover de bisschoppen van Utrecht in de middeleeuwen als vorst de landsheerlijkheid uitoefenden. \n",
      "sticht utrecht      14.500452\n",
      "bisschop            14.426491\n",
      "sticht              14.397942\n",
      "bisschop utrecht    14.143663\n",
      "luik                12.177787\n",
      "ii                  12.118064\n",
      "graafschap          11.782087\n",
      "iv                  11.620562\n",
      "iii                 11.343168\n",
      "hendrik             11.151790\n",
      "dtype: float64\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import stopwordsiso as stopwords\n",
    "import numpy as np\n",
    "\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "descfile = 'wiki/nlwiki-20220301/experiments/clean-q1-descriptions.tsv'\n",
    "# First column -> second column of the headerless TSV (entity id -> description).\n",
    "e_desc = pd.read_csv(descfile, sep='\\t', on_bad_lines='skip', header=None, index_col=0)[1].to_dict()\n",
    "\n",
    "# ents = [11775750, 2595790] # Openbaar Ministerie\n",
    "# ents = [29520, 148] # China\n",
    "ents = [803, 776, 707767] # Utrecht\n",
    "\n",
    "# Render the ids as an explicit SQL list instead of relying on the Python list repr.\n",
    "ent_list = ', '.join(str(int(e)) for e in ents)\n",
    "sample = duckdb.query(\n",
    "f\"\"\"\n",
    "select text, target from texts, links \n",
    "where texts.index=links.index and target in ({ent_list})\n",
    "\"\"\").df().drop_duplicates()\n",
    "\n",
    "stop_words = list(stopwords.stopwords('nl'))\n",
    "\n",
    "tfidf = TfidfVectorizer(ngram_range=(1,3), min_df=10, max_df=.75, stop_words=stop_words)\n",
    "X = tfidf.fit_transform(sample['text'])\n",
    "\n",
    "# idf weight of every feature kept by the vectorizer\n",
    "feat_weight = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))\n",
    "\n",
    "for e in ents:\n",
    "    # An entity may have no entry in the descriptions file; do not abort on it.\n",
    "    print(e, e_desc.get(e, '(no description)'))\n",
    "    # Plain boolean array: row selection is positional, independent of the\n",
    "    # (non-contiguous after drop_duplicates) pandas index.\n",
    "    mask = (sample['target'] == e).to_numpy()\n",
    "    wdocs = tfidf.inverse_transform(X[mask])\n",
    "    # Number of sampled texts for this entity in which each feature occurs.\n",
    "    wcount = pd.Series(wdocs).explode().value_counts()\n",
    "    # Score = log(1 + count) * idf; keep the ten highest.\n",
    "    top_feats = pd.Series({\n",
    "        w: np.log1p(c) * feat_weight[w]\n",
    "        for w, c in wcount.items()\n",
    "    }).sort_values()[::-1][:10]\n",
    "    print(top_feats)\n",
    "    print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "minimel",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}