import tqdm
import pathlib
import re
import json

try:
    import dawg
except ImportError:
    import dawg_python as dawg

# Purpose: for every candidate entity, keep the first sentence of the first
# paragraph listed for its page, and write the (entity id, sentence) pairs
# to a TSV file.

root = pathlib.Path('wiki/nlwiki-20220301')
dawgfile = root / 'index_nlwiki-20220301.dawg'
index = dawg.IntDAWG()
index.load(str(dawgfile))

# The candidates file is a JSON object; each value is an iterable of ids
# (anything int() accepts). Only the union of those ids is needed here.
with open(root / 'experiments' / 'clean-q1.json', encoding='utf-8') as f:
    candidates = json.load(f)
entities = set(int(c) for cs in candidates.values() for c in cs)

# Raw strings: '\(' and '\.' are invalid escapes in a normal string literal.
brackets = re.compile(r'\([^)]*\) ?')  # parenthesised aside + one optional space
phrase = re.compile(r'.{10,}?\. ')     # shortest run of >= 10 chars followed by ". "

descriptions = []
# Sorted so that the order of the output rows is reproducible between runs
# (glob order is otherwise up to the file system).
files = sorted((root / 'nlwiki-20220301-paragraph-links').glob('*'))
for fname in tqdm.tqdm(files):
    prev = None
    with open(fname, encoding='utf-8') as lines:
        for line in lines:
            # Expected layout: page <TAB> links <TAB> text. The text keeps
            # any further tabs because of the maxsplit.
            fields = line.split('\t', 2)
            if len(fields) != 3:
                continue  # malformed line: skip instead of crashing on unpack
            page, _, text = fields
            # NOTE(review): assumes `prev` is updated for every line, i.e. only
            # the first paragraph of each run of identical page keys is used.
            if page != prev:
                prev = page
                # Cheap lookup first: the regex work is only needed for
                # candidate pages. A missing key yields None, which is never
                # in `entities`.
                e = index.get(page)
                if e in entities:
                    text = brackets.sub('', text)
                    m = phrase.match(text)
                    if m:
                        # The match is anchored at position 0, so this is the
                        # leading sentence including its trailing ". ".
                        descriptions.append((e, m.group(0)))

print(len(descriptions), 'descriptions')
with open(root / 'experiments' / 'clean-q1-descriptions.tsv', 'w', encoding='utf-8') as fw:
    for e, d in descriptions:
        print(e, d, sep='\t', file=fw)
import duckdb

import pandas as pd
import json
import pathlib
import tqdm

try:
    import dawg
except ImportError:
    import dawg_python as dawg

# Purpose: load a sample of the paragraph-links files into two DataFrames:
#   texts - one row per paragraph (index, page, text)
#   links - one row per link      (index, name, target)
# where `index` is a paragraph row id shared by both frames.

wiki_root = pathlib.Path('wiki/nlwiki-20220301')
dawgfile = wiki_root / 'index_nlwiki-20220301.dawg'
index = dawg.IntDAWG()
index.load(str(dawgfile))


def loads(x):
    """Parse the JSON text `x` and return it as a dict, or `{}` on failure.

    Best-effort by design: input that is not a string/bytes, is not valid
    JSON, or does not decode to a JSON object yields an empty dict, so the
    caller can always call `.items()` on the result.
    """
    try:
        parsed = json.loads(x)
    except (TypeError, ValueError):
        # TypeError: x is not str/bytes (e.g. the float NaN pandas uses for
        # an empty field). ValueError: malformed JSON (JSONDecodeError is a
        # ValueError subclass). Unlike a bare `except`, this lets
        # KeyboardInterrupt / SystemExit propagate.
        return {}
    return parsed if isinstance(parsed, dict) else {}


# `root` ends up bound to the paragraph-links directory, exactly as before;
# the dump directory itself stays available as `wiki_root`.
root = wiki_root / 'nlwiki-20220301-paragraph-links'
N_FILES = 50  # only this many files are loaded (sample, not the full dump)

n = 0  # number of paragraph rows read so far; offset for the next file
text_frames, link_frames = [], []
# Sorted so that the same N_FILES files are picked on every run.
for fname in tqdm.tqdm(sorted(root.glob('*'))[:N_FILES]):
    # NOTE(review): read_csv applies CSV quoting rules by default; if a field
    # can start with '"' in these files, quoting=csv.QUOTE_NONE may be needed.
    # Confirm against the file format.
    df = pd.read_csv(fname, sep='\t', on_bad_lines='skip', names=['page', 'links', 'text'])
    # Page key -> integer id via the DAWG; unknown keys become <NA>.
    df['page'] = df['page'].map(index.get).astype('Int64')
    # Shift the per-file row numbers so paragraph ids are unique across files.
    df.index += n
    n += len(df)
    # One (name, target) pair per row; explode() keeps the paragraph id as
    # the index. A paragraph without links yields one row with NaN in it.
    pairs = df.links.map(lambda x: list(loads(x).items())).explode()
    link_frames.append(pd.DataFrame({'name': pairs.str[0], 'target': pairs.str[1]}))
    df = df.drop(columns=['links'])
    text_frames.append(df)

# reset_index() turns the paragraph id into an ordinary `index` column in
# both frames, which is what links the two tables together.
texts = pd.concat(text_frames).reset_index()
links = pd.concat(link_frames).reset_index()
| \n", " | target | \n", "c | \n", "
|---|---|---|
| 0 | \n", "803 | \n", "900 | \n", "
| 1 | \n", "776 | \n", "174 | \n", "
| 2 | \n", "707767 | \n", "24 | \n", "
| 3 | \n", "261716 | \n", "16 | \n", "
| 4 | \n", "2677914 | \n", "5 | \n", "
| 5 | \n", "221653 | \n", "5 | \n", "
| 6 | \n", "575655 | \n", "4 | \n", "
| 7 | \n", "2679365 | \n", "4 | \n", "
| 8 | \n", "347488 | \n", "3 | \n", "
| 9 | \n", "85308316 | \n", "2 | \n", "