{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Getting Started\n",
    "\n",
    "First, install the library with extras to train models:\n",
    "```bash\n",
    "pip install -e 'git+https://github.com/bennokr/minimel.git#egg=minimel[train]'\n",
    "```\n",
    "\n",
    "The quotes keep shells such as zsh from interpreting `[train]` as a glob pattern.\n",
    "\n",
    "The cells below run the whole pipeline, from downloading the dumps to evaluating a trained model, on the small Interlingua Wikipedia (`iawiki`)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2024-05-22 23:31:53,738 - wikimapper.download - INFO - Downloading [https://dumps.wikimedia.org/iawiki/latest/iawiki-latest-page.sql.gz] to [wiki/iawiki-latest/iawiki-latest-page.sql.gz]\n",
      "2024-05-22 23:32:02,031 - wikimapper.download - INFO - Downloading [https://dumps.wikimedia.org/iawiki/latest/iawiki-latest-page_props.sql.gz] to [wiki/iawiki-latest/iawiki-latest-page_props.sql.gz]\n",
      "2024-05-22 23:32:04,885 - wikimapper.download - INFO - Downloading [https://dumps.wikimedia.org/iawiki/latest/iawiki-latest-redirect.sql.gz] to [wiki/iawiki-latest/iawiki-latest-redirect.sql.gz]\n",
      "2024-05-22 23:32:06,819 - wikimapper.processor - INFO - Creating index for [iawiki-latest] in [wiki/iawiki-latest/index_iawiki-latest.db]\n",
      "2024-05-22 23:32:06,822 - wikimapper.processor - INFO - Parsing pages dump\n",
      "2024-05-22 23:32:07,209 - wikimapper.processor - INFO - Creating database index on 'wikipedia_title'\n",
      "2024-05-22 23:32:07,237 - wikimapper.processor - INFO - Parsing page properties dump\n",
      "2024-05-22 23:32:07,529 - wikimapper.processor - INFO - Parsing redirects dump\n",
      "2024-05-22 23:32:07,591 - wikimapper.processor - INFO - Creating database index on 'wikidata_id'\n"
     ]
    }
   ],
   "source": [
    "# Download the page / page_props / redirect SQL dumps and build the wikimapper index database.\n",
    "wiki = 'iawiki-latest'  # use Interlingua language Wikipedia version to test\n",
    "root = 'wiki/' + wiki\n",
    "!mkdir -p $root\n",
    "!wikimapper download $wiki --dir $root\n",
    "outdb = f'{root}/index_{wiki}.db'\n",
    "!wikimapper create $wiki --dumpdir $root --target $outdb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading mapping...: 100%|█████████████| 34570/34570 [00:00<00:00, 329200.66it/s]\n",
      "INFO:root:Building IntDAWG trie...\n",
      "INFO:root:Saving to wiki/iawiki-latest/index_iawiki-latest.dawg...\n"
     ]
    }
   ],
   "source": [
    "# Build the IntDAWG trie from the index database; it is saved next to it as index_<wiki>.dawg.\n",
    "!minimel -v index $outdb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2024-05-22 23:35:13-- https://dumps.wikimedia.org/iawiki/latest/iawiki-latest-pages-articles.xml.bz2\n",
      "Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 208.80.154.71\n",
      "Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|208.80.154.71|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 10654986 (10M) [application/octet-stream]\n",
      "Saving to: ‘wiki/iawiki-latest/iawiki-latest-pages-articles.xml.bz2’\n",
      "\n",
      "iawiki-latest-pages 82%[===============> ] 8,37M 17,3KB/s in 4m 7s \n",
      "\n",
      "2024-05-22 23:39:26 (34,7 KB/s) - Connection closed at byte 8781489. Retrying.\n",
      "\n",
      "--2024-05-22 23:39:27-- (try: 2) https://dumps.wikimedia.org/iawiki/latest/iawiki-latest-pages-articles.xml.bz2\n",
      "Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|208.80.154.71|:443... connected.\n",
      "HTTP request sent, awaiting response... 206 Partial Content\n",
      "Length: 10654986 (10M), 1873497 (1,8M) remaining [application/octet-stream]\n",
      "Saving to: ‘wiki/iawiki-latest/iawiki-latest-pages-articles.xml.bz2’\n",
      "\n",
      "iawiki-latest-pages 100%[++++++++++++++++===>] 10,16M 404KB/s in 4,5s \n",
      "\n",
      "2024-05-22 23:39:34 (404 KB/s) - ‘wiki/iawiki-latest/iawiki-latest-pages-articles.xml.bz2’ saved [10654986/10654986]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Fetch the pages-articles XML dump and decompress it.\n",
    "# The flags below make this cell safe to re-run (Restart & Run All):\n",
    "#   wget -N      re-downloads only when the remote dump is newer than the local copy\n",
    "#                (instead of saving a duplicate with a .1 suffix)\n",
    "#   bunzip2 -k   keeps the .bz2 archive so that wget -N has something to compare against\n",
    "#   bunzip2 -f   overwrites an .xml left over from a previous run instead of aborting\n",
    "wikiname = wiki.split('-')[0]\n",
    "!wget -N -P $root https://dumps.wikimedia.org/$wikiname/latest/$wiki-pages-articles.xml.bz2\n",
    "!bunzip2 -kf $root/$wiki-pages-articles.xml.bz2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:root:Finished in 30s################] | 100% Completed | 30.5s\u001b[2K\n",
      "INFO:root:Wrote 100 partitions\n"
     ]
    }
   ],
   "source": [
    "# Extract paragraphs from the XML dump into 100 partitions (-n 100).\n",
    "# NOTE(review): the partitions presumably land in {root}/{wiki}-paragraph-links/,\n",
    "# the directory the count / vectorize / run steps read from - confirm.\n",
    "dump = f'{root}/{wiki}-pages-articles.xml'\n",
    "dawg = f'{root}/index_{wiki}.dawg'\n",
    "!minimel -v get-paragraphs -n 100 $dump $dawg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:root:Writing to wiki/iawiki-latest/ents-disambig.txt\n"
     ]
    }
   ],
   "source": [
    "# Query pages for this language edition and store them in ents-disambig.txt\n",
    "# (presumably the disambiguation pages, going by the file name - confirm).\n",
    "lang = wiki.split('wiki')[0]  # language code, i.e. the part of the wiki name before 'wiki'\n",
    "disambigpages = f'{root}/ents-disambig.txt'\n",
    "!minimel -v query-pages $lang -o $disambigpages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:root:Extracting disambiguation links...\n",
      "INFO:root:Finished in 2s#################] | 100% Completed | 2.5s\u001b[2K\n",
      "INFO:root:Writing to wiki/iawiki-latest/disambig.json\n"
     ]
    }
   ],
   "source": [
    "# Extract the disambiguation links from the dump; the result is written to disambig.json.\n",
    "!minimel -v get-disambig -n 100 $dump $dawg $disambigpages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:root:Counting links...\n",
      "INFO:root:Finished in 6s#################] | 100% Completed | 6.8s\u001b[2K\n",
      "INFO:root:Got 32602 counts.\n",
      "INFO:root:Aggregating...\n",
      "INFO:root:Finished in 10s################] | 100% Completed | 10.5s\u001b[2K\n",
      "INFO:root:Writing to wiki/iawiki-latest/count.min2.json\n"
     ]
    }
   ],
   "source": [
    "# Count the links in the paragraph partitions and aggregate them into count.min2.json.\n",
    "paragraphlinks = f'{root}/{wiki}-paragraph-links/'\n",
    "!minimel -v count $paragraphlinks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get Wikidata IDs for disambiguation and list articles\n",
    "badent = f'{root}/badent.txt'\n",
    "!minimel query-pages $lang -q -o $badent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Counting entities...: 100%|███████████| 11560/11560 [00:00<00:00, 178917.02it/s]\n",
      "INFO:root:Removing 133 bad entities\n",
      "Loading labels...: 100%|███████████████| 34570/34570 [00:00<00:00, 97792.93it/s]\n",
      "Filtering names...: 100%|██████████████| 11498/11498 [00:00<00:00, 17444.66it/s]\n",
      "INFO:root:Filtering out 1 bad names\n",
      "INFO:root:Keeping 11497 good names\n",
      "INFO:root:Writing to wiki/iawiki-latest/clean.json\n"
     ]
    }
   ],
   "source": [
    "# Drop the bad entities (-b) and bad names from the counts; the result is written to clean.json.\n",
    "disambigfile = f'{root}/disambig.json'\n",
    "countfile = f'{root}/count.min2.json'\n",
    "!minimel -v clean -b $badent $outdb $disambigfile $countfile"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:root:Vectorizing training examples for 286 ambiguous names\n",
      "INFO:root:Writing to wiki/iawiki-latest/vec.clean.dat.parts\n",
      "INFO:root:Finished in 3s#################] | 100% Completed | 3.4s\u001b[2K\n",
      "INFO:root:Wrote 34 partitions\n",
      "INFO:root:Concatenating to wiki/iawiki-latest/vec.clean.dat\n",
      "Concatenating: 100%|██████████████████████████| 34/34 [00:00<00:00, 3840.94it/s]\n"
     ]
    }
   ],
   "source": [
    "# Vectorize training examples for the ambiguous names; the parts are concatenated into vec.clean.dat.\n",
    "cleanfile = f'{root}/clean.json'\n",
    "!minimel -v vectorize $paragraphlinks $cleanfile"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:root:Writing to wiki/iawiki-latest/model.20b.vw\n",
      "creating quadratic features for pairs: ls sf\n",
      "final_regressor = wiki/iawiki-latest/model.20b.vw\n",
      "creating cache_file = wiki/iawiki-latest/vec.clean.dat.cache\n",
      "Reading datafile = wiki/iawiki-latest/vec.clean.dat\n",
      "num sources = 1\n",
      "Num weight bits = 20\n",
      "learning rate = 0.5\n",
      "initial_t = 0\n",
      "power_t = 0.5\n",
      "decay_learning_rate = 1\n",
      "Enabled learners: gd, scorer-identity, csoaa_ldf-prob, shared_feature_merger\n",
      "Input label = CS\n",
      "Output pred = SCALARS\n",
      "average since example example current current current\n",
      "loss last counter weight label predict features\n",
      "0.000000 0.000000 1 1.0 unknown 0 1414\n",
      "0.000000 0.000000 2 2.0 unknown 0 24\n",
      "0.000000 0.000000 4 4.0 unknown 0 348\n",
      "0.000000 0.000000 8 8.0 unknown 0 12\n",
      "0.125000 0.250000 16 16.0 unknown 0 188\n",
      "0.093750 0.062500 32 32.0 unknown 0 100\n",
      "0.046875 0.000000 64 64.0 unknown 0 108\n",
      "0.039062 0.031250 128 128.0 unknown 0 72\n",
      "0.031250 0.023438 256 256.0 unknown 0 552\n",
      "0.064453 0.097656 512 512.0 unknown 0 88\n",
      "0.102539 0.140625 1024 1024.0 known 41 150\n",
      "0.092773 0.083008 2048 2048.0 unknown 0 104\n",
      "0.104248 0.115723 4096 4096.0 known 37922 136\n",
      "0.115845 0.127441 8192 8192.0 unknown 0 348\n",
      "unknown unknown 16384 16384.0 unknown 0 24 h\n",
      "unknown unknown 32768 32768.0 unknown 0 108 h\n",
      "\n",
      "finished run\n",
      "number of examples per pass = 10250\n",
      "passes used = 4\n",
      "weighted example sum = 41000.000000\n",
      "weighted label sum = 0.000000\n",
      "average loss = undefined (no holdout)\n",
      "average multiclass log loss = 0.394560 h\n",
      "total feature number = 15863608\n",
      "INFO:root:Wrote to model.20b.vw\n"
     ]
    }
   ],
   "source": [
    "# Train a model on the vectorized examples; it is written to model.20b.vw.\n",
    "vecfile = f'{root}/vec.clean.dat'\n",
    "!minimel -v train $vecfile"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Predicting: 100%|███████████████████████| 59765/59765 [00:17<00:00, 3471.64it/s]\n",
      "INFO:root:,,0\n",
      "micro,precision,0.909326061550448\n",
      "micro,recall,0.909326061550448\n",
      "micro,fscore,0.909326061550448\n",
      "macro,precision,0.9236526246023489\n",
      "macro,recall,0.9062367026135526\n",
      "macro,fscore,0.9121998587060755\n",
      ",support,192525.0\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Run the trained model over all paragraph partitions and print precision / recall / F-score.\n",
    "# The predictions themselves are discarded (-o /dev/null); only the evaluation is of interest here.\n",
    "modelfile = f'{root}/model.20b.vw'\n",
    "!minimel -v run --evaluate -o /dev/null $dawg $cleanfile $modelfile $paragraphlinks/*"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "minimel",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}