{ "cells": [ { "cell_type": "code", "execution_count": 121, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 1000/1000 [00:11<00:00, 86.14it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "184022 descriptions\n" ] } ], "source": [ "import tqdm\n", "import pathlib\n", "import re\n", "import json\n", "\n", "try:\n", " import dawg\n", "except ImportError:\n", " import dawg_python as dawg\n", "\n", "root = pathlib.Path('wiki/nlwiki-20220301')\n", "dawgfile = root / 'index_nlwiki-20220301.dawg'\n", "index = dawg.IntDAWG()\n", "index.load(str(dawgfile))\n", "\n", "candidates = json.load(open('wiki/nlwiki-20220301/experiments/clean-q1.json'))\n", "entities = set(int(c) for cs in candidates.values() for c in cs)\n", "\n", "brackets = re.compile('\\([^)]*\\) ?')\n", "phrase = re.compile('.{10,}?\\. ')\n", "\n", "descriptions = []\n", "files = list(pathlib.Path('wiki/nlwiki-20220301/nlwiki-20220301-paragraph-links/').glob('*'))\n", "for fname in tqdm.tqdm(files):\n", " prev = None\n", " for line in open(fname):\n", " page, _, text = line.split('\\t', 2)\n", " if page != prev:\n", " text, _ = re.subn(brackets,'',text)\n", " m = re.match(phrase, text)\n", " if m:\n", " a,b = m.span()\n", " e = index.get(page)\n", " if e in entities:\n", " descriptions.append((e,text[a:b]))\n", " prev = page\n", "\n", "print(len(descriptions), 'descriptions')\n", "with open('wiki/nlwiki-20220301/experiments/clean-q1-descriptions.tsv', 'w') as fw:\n", " for e,d in descriptions:\n", " print(e,d, sep='\\t', file=fw)" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 50/50 [00:03<00:00, 14.29it/s]\n" ] } ], "source": [ "import duckdb\n", "\n", "import pandas as pd\n", "import json\n", "import pathlib\n", "import tqdm\n", "\n", "try:\n", " import dawg\n", "except ImportError:\n", " import dawg_python as dawg\n", "\n", "root = pathlib.Path('wiki/nlwiki-20220301')\n", "dawgfile = root / 'index_nlwiki-20220301.dawg'\n", "index = dawg.IntDAWG()\n", "index.load(str(dawgfile))\n", "\n", "def loads(x):\n", " try:\n", " return json.loads(x)\n", " except:\n", " return {}\n", "\n", "root = pathlib.Path('wiki/nlwiki-20220301/nlwiki-20220301-paragraph-links/')\n", "n = 0\n", "texts, links = [], []\n", "for fname in tqdm.tqdm(list(root.glob('*'))[:50]):\n", " df = pd.read_csv(fname, sep='\\t', on_bad_lines='skip', names=['page', 'links', 'text'])\n", " df['page'] = df['page'].map(index.get).astype('Int64')\n", " df.index += n\n", " n += len(df)\n", " pairs = df.links.map(lambda x: list(loads(x).items())).explode()\n", " links.append( pd.DataFrame({'name':pairs.str[0], 'target':pairs.str[1]}) )\n", " df.drop(columns=['links'], inplace=True)\n", " texts.append( df )\n", "texts = pd.concat( texts ).reset_index()\n", "links = pd.concat( links ).reset_index()\n" ] }, { "cell_type": "code", "execution_count": 118, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
targetc
0803900
1776174
270776724
326171616
426779145
52216535
65756554
726793654
83474883
9853083162
\n", "
" ], "text/plain": [ " target c\n", "0 803 900\n", "1 776 174\n", "2 707767 24\n", "3 261716 16\n", "4 2677914 5\n", "5 221653 5\n", "6 575655 4\n", "7 2679365 4\n", "8 347488 3\n", "9 85308316 2" ] }, "execution_count": 118, "metadata": {}, "output_type": "execute_result" } ], "source": [ "duckdb.query(\n", "f\"\"\"\n", "select target, count(*) c from texts, links \n", "where texts.index=links.index and name == 'Utrecht'\n", "group by target\n", "order by c desc\n", "limit 10\n", "\"\"\").df()" ] }, { "cell_type": "code", "execution_count": 122, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "803 Utrecht is een stad en gemeente in Nederland en de hoofdstad van de provincie Utrecht. \n", "werk 16.226999\n", "den haag 16.168381\n", "utrecht stad 16.106695\n", "haag 16.104145\n", "rotterdam 16.102599\n", "universiteit 16.101863\n", "hoogleraar 16.091765\n", "stad utrecht 16.040620\n", "museum 16.017313\n", "nederlandse stad 15.991953\n", "dtype: float64\n", "\n", "776 Utrecht is met een landoppervlakte van 1.485 km² de op een na kleinste provincie van Nederland. \n", "provincie utrecht 15.735772\n", "nederlandse provincie 15.461026\n", "nederlandse provincie utrecht 15.311498\n", "eemnes 15.273796\n", "provincie 15.219166\n", "eemnes provincie 15.066386\n", "eemnes provincie utrecht 15.066386\n", "zuid holland 14.828159\n", "noord holland 14.789730\n", "holland utrecht 14.663517\n", "dtype: float64\n", "\n", "707767 Het Sticht Utrecht was het territorium waarover de bisschoppen van Utrecht in de middeleeuwen als vorst de landsheerlijkheid uitoefenden. \n", "sticht utrecht 14.500452\n", "bisschop 14.426491\n", "sticht 14.397942\n", "bisschop utrecht 14.143663\n", "luik 12.177787\n", "ii 12.118064\n", "graafschap 11.782087\n", "iv 11.620562\n", "iii 11.343168\n", "hendrik 11.151790\n", "dtype: float64\n", "\n" ] } ], "source": [ "import stopwordsiso as stopwords\n", "import numpy as np\n", "\n", "descfile = 'wiki/nlwiki-20220301/experiments/clean-q1-descriptions.tsv'\n", "e_desc = pd.read_csv(descfile, sep='\\t', on_bad_lines='skip', header=None, index_col=0)[1].to_dict()\n", "\n", "# ents = [11775750, 2595790] # Openbaar Ministerie\n", "# ents = [29520, 148] # China\n", "ents = [803, 776, 707767] # Utrecht\n", "\n", "sample = duckdb.query(\n", "f\"\"\"\n", "select text, target from texts, links \n", "where texts.index=links.index and target in {str(ents)}\n", "\"\"\").df().drop_duplicates()\n", "\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "stop_words = list(stopwords.stopwords('nl'))\n", "\n", "tfidf = TfidfVectorizer(ngram_range=(1,3), min_df=10, max_df=.75, stop_words=stop_words)\n", "X = tfidf.fit_transform(sample['text'])\n", "\n", "feat_weight = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))\n", "\n", "for e in ents:\n", " print(e, e_desc[e])\n", " wdocs = tfidf.inverse_transform(X[sample['target'] == e])\n", " wcount = pd.Series(wdocs).explode().value_counts()\n", " top_feats = pd.Series({\n", " w:np.log1p(c)*feat_weight[w]\n", " for w,c in wcount.to_dict().items()\n", " }).sort_values()[::-1][:10]\n", " print(top_feats)\n", " print()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "minimel", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.15" } }, "nbformat": 4, "nbformat_minor": 2 }