{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Running experiments\n", "Run cross-validation with `minimel experiment` using `--fold` and `--split`. The cell below evaluates folds 1 and 2 with `--split 5`; the following cell loads the results from `wiki/iawiki-latest/evaluation.csv`." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "INFO:root:Running experiments for wiki/iawiki-latest, outputs in wiki/iawiki-latest\n", "INFO:root:Using wiki/iawiki-latest/index_iawiki-latest.dawg\n", "INFO:root:Using wiki/iawiki-latest/ents-disambig.txt\n", "INFO:root:Using wiki/iawiki-latest/disambig.json\n", "INFO:root:Using wiki/iawiki-latest/iawiki-latest-paragraph-links\n", "INFO:root:Sweeping parameters {'head': [None], 'stem': ('',), 'min_count': (2,), 'split': [5], 'fold': [1, 2]}\n", "INFO:root:Sweeping parameters {'stem': [None], 'min_count': [2], 'freqnorm': (False,), 'badentfile': ('',), 'tokenscore_threshold': (0.1,), 'entropy_threshold': (1.0,), 'countratio_threshold': (0.5,), 'quantile_top_shadowed': (0,), 'cluster_threshold': [0.5]}\n", "INFO:root:Sweeping parameters {'head': [None], 'stem': [None], 'vectorizer': ('',), 'ent_feats_csv': ('',), 'balanced': (False,), 'usenil': (False,), 'split': [5], 'fold': [1]}\n", "INFO:root:Sweeping parameters {'bits': (20,)}\n", "INFO:root:Sweeping parameters {'runfile': [PosixPath('wiki/iawiki-latest/iawiki-latest-paragraph-links')], 'use_fallback': (True,), 'split': [5], 'fold': [1]}\n", "INFO:root:Running baseline...\n", "Predicting: 100%|██████████████████████| 11972/11972 [00:00<00:00, 18242.27it/s]\n", "INFO:root:micro precision 0.874295\n", " recall 0.874295\n", " fscore 0.874295\n", "macro precision 0.824055\n", " recall 0.803768\n", " fscore 0.809093\n", " support 38805.000000\n", "dtype: float64\n", "INFO:root:Running model...\n", "Predicting: 100%|██████████████████████| 11972/11972 [00:00<00:00, 14345.83it/s]\n", "INFO:root:micro precision 0.877799\n", " recall 0.877799\n", " fscore 0.877799\n", "macro precision 0.827748\n", " recall 0.807085\n", " fscore 0.812927\n", " 
support 38805.000000\n", "dtype: float64\n", "INFO:root:Sweeping parameters {'head': [None], 'stem': [None], 'vectorizer': ('',), 'ent_feats_csv': ('',), 'balanced': (False,), 'usenil': (False,), 'split': [5], 'fold': [1]}\n", "INFO:root:Sweeping parameters {'bits': (20,)}\n", "INFO:root:Sweeping parameters {'runfile': [PosixPath('wiki/iawiki-latest/iawiki-latest-paragraph-links')], 'use_fallback': (True,), 'split': [5], 'fold': [1]}\n", "INFO:root:Running baseline...\n", "Predicting: 100%|██████████████████████| 11972/11972 [00:00<00:00, 17616.30it/s]\n", "INFO:root:micro precision 0.874295\n", " recall 0.874295\n", " fscore 0.874295\n", "macro precision 0.824055\n", " recall 0.803768\n", " fscore 0.809093\n", " support 38805.000000\n", "dtype: float64\n", "INFO:root:Running model...\n", "Predicting: 100%|██████████████████████| 11972/11972 [00:00<00:00, 13261.47it/s]\n", "INFO:root:micro precision 0.878134\n", " recall 0.878134\n", " fscore 0.878134\n", "macro precision 0.827807\n", " recall 0.807175\n", " fscore 0.813038\n", " support 38805.000000\n", "dtype: float64\n", "INFO:root:Sweeping parameters {'stem': [None], 'min_count': [2], 'freqnorm': (False,), 'badentfile': ('',), 'tokenscore_threshold': (0.1,), 'entropy_threshold': (1.0,), 'countratio_threshold': (0.5,), 'quantile_top_shadowed': (0,), 'cluster_threshold': [0.5]}\n", "INFO:root:Sweeping parameters {'head': [None], 'stem': [None], 'vectorizer': ('',), 'ent_feats_csv': ('',), 'balanced': (False,), 'usenil': (False,), 'split': [5], 'fold': [2]}\n", "INFO:root:Sweeping parameters {'bits': (20,)}\n", "INFO:root:Sweeping parameters {'runfile': [PosixPath('wiki/iawiki-latest/iawiki-latest-paragraph-links')], 'use_fallback': (True,), 'split': [5], 'fold': [2]}\n", "INFO:root:Running baseline...\n", "Predicting: 100%|██████████████████████| 11957/11957 [00:00<00:00, 25108.09it/s]\n", "INFO:root:micro precision 0.895722\n", " recall 0.895722\n", " fscore 0.895722\n", "macro precision 0.877247\n", " recall 
0.861811\n", " fscore 0.865458\n", " support 38407.000000\n", "dtype: float64\n", "INFO:root:Running model...\n", "Predicting: 100%|██████████████████████| 11957/11957 [00:01<00:00, 11525.34it/s]\n", "INFO:root:micro precision 0.898690\n", " recall 0.898690\n", " fscore 0.898690\n", "macro precision 0.880691\n", " recall 0.865038\n", " fscore 0.869222\n", " support 38407.000000\n", "dtype: float64\n", "INFO:root:Sweeping parameters {'head': [None], 'stem': [None], 'vectorizer': ('',), 'ent_feats_csv': ('',), 'balanced': (False,), 'usenil': (False,), 'split': [5], 'fold': [2]}\n", "INFO:root:Sweeping parameters {'bits': (20,)}\n", "INFO:root:Sweeping parameters {'runfile': [PosixPath('wiki/iawiki-latest/iawiki-latest-paragraph-links')], 'use_fallback': (True,), 'split': [5], 'fold': [2]}\n", "INFO:root:Running baseline...\n", "Predicting: 100%|██████████████████████| 11957/11957 [00:00<00:00, 23652.54it/s]\n", "INFO:root:micro precision 0.895722\n", " recall 0.895722\n", " fscore 0.895722\n", "macro precision 0.877247\n", " recall 0.861811\n", " fscore 0.865458\n", " support 38407.000000\n", "dtype: float64\n", "INFO:root:Running model...\n", "Predicting: 100%|██████████████████████| 11957/11957 [00:01<00:00, 11056.60it/s]\n", "INFO:root:micro precision 0.898951\n", " recall 0.898951\n", " fscore 0.898951\n", "macro precision 0.880854\n", " recall 0.865183\n", " fscore 0.869402\n", " support 38407.000000\n", "dtype: float64\n" ] } ], "source": [ "!minimel -v experiment 'wiki/iawiki-latest' --fold 1 2 --split 5 --cluster-threshold 0.5 --evaluate" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
modelcount.min_countcount.splitcount.foldclean.min_countclean.tokenscore_thresholdclean.entropy_thresholdclean.countratio_thresholdclean.cluster_thresholdvec.split...run.splitrun.foldrun.fallbackmicro.precisionmicro.recallmicro.fscoremacro.precisionmacro.recallmacro.fscore.support
0baseline25120.11.00.50.255...51/Users/benno/Documents/postdoc/projects/minima...0.8742950.8742950.8742950.8240550.8037680.80909338805.0
1model25120.11.00.50.255...51/Users/benno/Documents/postdoc/projects/minima...0.8777990.8777990.8777990.8277480.8070850.81292738805.0
2baseline25120.11.00.50.505...51/Users/benno/Documents/postdoc/projects/minima...0.8742950.8742950.8742950.8240550.8037680.80909338805.0
3model25120.11.00.50.505...51/Users/benno/Documents/postdoc/projects/minima...0.8781340.8781340.8781340.8278070.8071750.81303838805.0
4baseline25220.11.00.50.255...52/Users/benno/Documents/postdoc/projects/minima...0.8957220.8957220.8957220.8772470.8618110.86545838407.0
5model25220.11.00.50.255...52/Users/benno/Documents/postdoc/projects/minima...0.8986900.8986900.8986900.8806910.8650380.86922238407.0
6baseline25220.11.00.50.505...52/Users/benno/Documents/postdoc/projects/minima...0.8957220.8957220.8957220.8772470.8618110.86545838407.0
7model25220.11.00.50.505...52/Users/benno/Documents/postdoc/projects/minima...0.8989510.8989510.8989510.8808540.8651830.86940238407.0
\n", "

8 rows × 23 columns

\n", "
" ], "text/plain": [ " model count.min_count count.split count.fold clean.min_count \\\n", "0 baseline 2 5 1 2 \n", "1 model 2 5 1 2 \n", "2 baseline 2 5 1 2 \n", "3 model 2 5 1 2 \n", "4 baseline 2 5 2 2 \n", "5 model 2 5 2 2 \n", "6 baseline 2 5 2 2 \n", "7 model 2 5 2 2 \n", "\n", " clean.tokenscore_threshold clean.entropy_threshold \\\n", "0 0.1 1.0 \n", "1 0.1 1.0 \n", "2 0.1 1.0 \n", "3 0.1 1.0 \n", "4 0.1 1.0 \n", "5 0.1 1.0 \n", "6 0.1 1.0 \n", "7 0.1 1.0 \n", "\n", " clean.countratio_threshold clean.cluster_threshold vec.split ... \\\n", "0 0.5 0.25 5 ... \n", "1 0.5 0.25 5 ... \n", "2 0.5 0.50 5 ... \n", "3 0.5 0.50 5 ... \n", "4 0.5 0.25 5 ... \n", "5 0.5 0.25 5 ... \n", "6 0.5 0.50 5 ... \n", "7 0.5 0.50 5 ... \n", "\n", " run.split run.fold run.fallback \\\n", "0 5 1 /Users/benno/Documents/postdoc/projects/minima... \n", "1 5 1 /Users/benno/Documents/postdoc/projects/minima... \n", "2 5 1 /Users/benno/Documents/postdoc/projects/minima... \n", "3 5 1 /Users/benno/Documents/postdoc/projects/minima... \n", "4 5 2 /Users/benno/Documents/postdoc/projects/minima... \n", "5 5 2 /Users/benno/Documents/postdoc/projects/minima... \n", "6 5 2 /Users/benno/Documents/postdoc/projects/minima... \n", "7 5 2 /Users/benno/Documents/postdoc/projects/minima... 
\n", "\n", " micro.precision micro.recall micro.fscore macro.precision macro.recall \\\n", "0 0.874295 0.874295 0.874295 0.824055 0.803768 \n", "1 0.877799 0.877799 0.877799 0.827748 0.807085 \n", "2 0.874295 0.874295 0.874295 0.824055 0.803768 \n", "3 0.878134 0.878134 0.878134 0.827807 0.807175 \n", "4 0.895722 0.895722 0.895722 0.877247 0.861811 \n", "5 0.898690 0.898690 0.898690 0.880691 0.865038 \n", "6 0.895722 0.895722 0.895722 0.877247 0.861811 \n", "7 0.898951 0.898951 0.898951 0.880854 0.865183 \n", "\n", " macro.fscore .support \n", "0 0.809093 38805.0 \n", "1 0.812927 38805.0 \n", "2 0.809093 38805.0 \n", "3 0.813038 38805.0 \n", "4 0.865458 38407.0 \n", "5 0.869222 38407.0 \n", "6 0.865458 38407.0 \n", "7 0.869402 38407.0 \n", "\n", "[8 rows x 23 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "pd.read_csv('wiki/iawiki-latest/evaluation.csv', index_col=0)" ] } ], "metadata": { "kernelspec": { "display_name": "minimel", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.13" } }, "nbformat": 4, "nbformat_minor": 2 }