{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Running experiments\n", "Run cross-validation with `minimel experiment` using the `--fold` and `--split` options. The run below uses `--split 5 --fold 1 2` and logs micro- and macro-averaged precision, recall and F-score for the baseline and for the model on each fold." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "INFO:root:Running experiments for wiki/iawiki-latest, outputs in wiki/iawiki-latest\n", "INFO:root:Using wiki/iawiki-latest/index_iawiki-latest.dawg\n", "INFO:root:Using wiki/iawiki-latest/ents-disambig.txt\n", "INFO:root:Using wiki/iawiki-latest/disambig.json\n", "INFO:root:Using wiki/iawiki-latest/iawiki-latest-paragraph-links\n", "INFO:root:Sweeping parameters {'head': [None], 'stem': ('',), 'min_count': (2,), 'split': [5], 'fold': [1, 2]}\n", "INFO:root:Sweeping parameters {'stem': [None], 'min_count': [2], 'freqnorm': (False,), 'badentfile': ('',), 'tokenscore_threshold': (0.1,), 'entropy_threshold': (1.0,), 'countratio_threshold': (0.5,), 'quantile_top_shadowed': (0,), 'cluster_threshold': [0.5]}\n", "INFO:root:Sweeping parameters {'head': [None], 'stem': [None], 'vectorizer': ('',), 'ent_feats_csv': ('',), 'balanced': (False,), 'usenil': (False,), 'split': [5], 'fold': [1]}\n", "INFO:root:Sweeping parameters {'bits': (20,)}\n", "INFO:root:Sweeping parameters {'runfile': [PosixPath('wiki/iawiki-latest/iawiki-latest-paragraph-links')], 'use_fallback': (True,), 'split': [5], 'fold': [1]}\n", "INFO:root:Running baseline...\n", "Predicting: 100%|██████████████████████| 11972/11972 [00:00<00:00, 18242.27it/s]\n", "INFO:root:micro precision 0.874295\n", " recall 0.874295\n", " fscore 0.874295\n", "macro precision 0.824055\n", " recall 0.803768\n", " fscore 0.809093\n", " support 38805.000000\n", "dtype: float64\n", "INFO:root:Running model...\n", "Predicting: 100%|██████████████████████| 11972/11972 [00:00<00:00, 14345.83it/s]\n", "INFO:root:micro precision 0.877799\n", " recall 0.877799\n", " fscore 0.877799\n", "macro precision 0.827748\n", " recall 0.807085\n", " fscore 0.812927\n", " 
support 38805.000000\n", "dtype: float64\n", "INFO:root:Sweeping parameters {'head': [None], 'stem': [None], 'vectorizer': ('',), 'ent_feats_csv': ('',), 'balanced': (False,), 'usenil': (False,), 'split': [5], 'fold': [1]}\n", "INFO:root:Sweeping parameters {'bits': (20,)}\n", "INFO:root:Sweeping parameters {'runfile': [PosixPath('wiki/iawiki-latest/iawiki-latest-paragraph-links')], 'use_fallback': (True,), 'split': [5], 'fold': [1]}\n", "INFO:root:Running baseline...\n", "Predicting: 100%|██████████████████████| 11972/11972 [00:00<00:00, 17616.30it/s]\n", "INFO:root:micro precision 0.874295\n", " recall 0.874295\n", " fscore 0.874295\n", "macro precision 0.824055\n", " recall 0.803768\n", " fscore 0.809093\n", " support 38805.000000\n", "dtype: float64\n", "INFO:root:Running model...\n", "Predicting: 100%|██████████████████████| 11972/11972 [00:00<00:00, 13261.47it/s]\n", "INFO:root:micro precision 0.878134\n", " recall 0.878134\n", " fscore 0.878134\n", "macro precision 0.827807\n", " recall 0.807175\n", " fscore 0.813038\n", " support 38805.000000\n", "dtype: float64\n", "INFO:root:Sweeping parameters {'stem': [None], 'min_count': [2], 'freqnorm': (False,), 'badentfile': ('',), 'tokenscore_threshold': (0.1,), 'entropy_threshold': (1.0,), 'countratio_threshold': (0.5,), 'quantile_top_shadowed': (0,), 'cluster_threshold': [0.5]}\n", "INFO:root:Sweeping parameters {'head': [None], 'stem': [None], 'vectorizer': ('',), 'ent_feats_csv': ('',), 'balanced': (False,), 'usenil': (False,), 'split': [5], 'fold': [2]}\n", "INFO:root:Sweeping parameters {'bits': (20,)}\n", "INFO:root:Sweeping parameters {'runfile': [PosixPath('wiki/iawiki-latest/iawiki-latest-paragraph-links')], 'use_fallback': (True,), 'split': [5], 'fold': [2]}\n", "INFO:root:Running baseline...\n", "Predicting: 100%|██████████████████████| 11957/11957 [00:00<00:00, 25108.09it/s]\n", "INFO:root:micro precision 0.895722\n", " recall 0.895722\n", " fscore 0.895722\n", "macro precision 0.877247\n", " recall 
0.861811\n", " fscore 0.865458\n", " support 38407.000000\n", "dtype: float64\n", "INFO:root:Running model...\n", "Predicting: 100%|██████████████████████| 11957/11957 [00:01<00:00, 11525.34it/s]\n", "INFO:root:micro precision 0.898690\n", " recall 0.898690\n", " fscore 0.898690\n", "macro precision 0.880691\n", " recall 0.865038\n", " fscore 0.869222\n", " support 38407.000000\n", "dtype: float64\n", "INFO:root:Sweeping parameters {'head': [None], 'stem': [None], 'vectorizer': ('',), 'ent_feats_csv': ('',), 'balanced': (False,), 'usenil': (False,), 'split': [5], 'fold': [2]}\n", "INFO:root:Sweeping parameters {'bits': (20,)}\n", "INFO:root:Sweeping parameters {'runfile': [PosixPath('wiki/iawiki-latest/iawiki-latest-paragraph-links')], 'use_fallback': (True,), 'split': [5], 'fold': [2]}\n", "INFO:root:Running baseline...\n", "Predicting: 100%|██████████████████████| 11957/11957 [00:00<00:00, 23652.54it/s]\n", "INFO:root:micro precision 0.895722\n", " recall 0.895722\n", " fscore 0.895722\n", "macro precision 0.877247\n", " recall 0.861811\n", " fscore 0.865458\n", " support 38407.000000\n", "dtype: float64\n", "INFO:root:Running model...\n", "Predicting: 100%|██████████████████████| 11957/11957 [00:01<00:00, 11056.60it/s]\n", "INFO:root:micro precision 0.898951\n", " recall 0.898951\n", " fscore 0.898951\n", "macro precision 0.880854\n", " recall 0.865183\n", " fscore 0.869402\n", " support 38407.000000\n", "dtype: float64\n" ] } ], "source": [ "!minimel -v experiment 'wiki/iawiki-latest' --fold 1 2 --split 5 --cluster-threshold 0.5 --evaluate" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | model | \n", "count.min_count | \n", "count.split | \n", "count.fold | \n", "clean.min_count | \n", "clean.tokenscore_threshold | \n", "clean.entropy_threshold | \n", "clean.countratio_threshold | \n", "clean.cluster_threshold | \n", "vec.split | \n", "... | \n", "run.split | \n", "run.fold | \n", "run.fallback | \n", "micro.precision | \n", "micro.recall | \n", "micro.fscore | \n", "macro.precision | \n", "macro.recall | \n", "macro.fscore | \n", ".support | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "baseline | \n", "2 | \n", "5 | \n", "1 | \n", "2 | \n", "0.1 | \n", "1.0 | \n", "0.5 | \n", "0.25 | \n", "5 | \n", "... | \n", "5 | \n", "1 | \n", "/Users/benno/Documents/postdoc/projects/minima... | \n", "0.874295 | \n", "0.874295 | \n", "0.874295 | \n", "0.824055 | \n", "0.803768 | \n", "0.809093 | \n", "38805.0 | \n", "
| 1 | \n", "model | \n", "2 | \n", "5 | \n", "1 | \n", "2 | \n", "0.1 | \n", "1.0 | \n", "0.5 | \n", "0.25 | \n", "5 | \n", "... | \n", "5 | \n", "1 | \n", "/Users/benno/Documents/postdoc/projects/minima... | \n", "0.877799 | \n", "0.877799 | \n", "0.877799 | \n", "0.827748 | \n", "0.807085 | \n", "0.812927 | \n", "38805.0 | \n", "
| 2 | \n", "baseline | \n", "2 | \n", "5 | \n", "1 | \n", "2 | \n", "0.1 | \n", "1.0 | \n", "0.5 | \n", "0.50 | \n", "5 | \n", "... | \n", "5 | \n", "1 | \n", "/Users/benno/Documents/postdoc/projects/minima... | \n", "0.874295 | \n", "0.874295 | \n", "0.874295 | \n", "0.824055 | \n", "0.803768 | \n", "0.809093 | \n", "38805.0 | \n", "
| 3 | \n", "model | \n", "2 | \n", "5 | \n", "1 | \n", "2 | \n", "0.1 | \n", "1.0 | \n", "0.5 | \n", "0.50 | \n", "5 | \n", "... | \n", "5 | \n", "1 | \n", "/Users/benno/Documents/postdoc/projects/minima... | \n", "0.878134 | \n", "0.878134 | \n", "0.878134 | \n", "0.827807 | \n", "0.807175 | \n", "0.813038 | \n", "38805.0 | \n", "
| 4 | \n", "baseline | \n", "2 | \n", "5 | \n", "2 | \n", "2 | \n", "0.1 | \n", "1.0 | \n", "0.5 | \n", "0.25 | \n", "5 | \n", "... | \n", "5 | \n", "2 | \n", "/Users/benno/Documents/postdoc/projects/minima... | \n", "0.895722 | \n", "0.895722 | \n", "0.895722 | \n", "0.877247 | \n", "0.861811 | \n", "0.865458 | \n", "38407.0 | \n", "
| 5 | \n", "model | \n", "2 | \n", "5 | \n", "2 | \n", "2 | \n", "0.1 | \n", "1.0 | \n", "0.5 | \n", "0.25 | \n", "5 | \n", "... | \n", "5 | \n", "2 | \n", "/Users/benno/Documents/postdoc/projects/minima... | \n", "0.898690 | \n", "0.898690 | \n", "0.898690 | \n", "0.880691 | \n", "0.865038 | \n", "0.869222 | \n", "38407.0 | \n", "
| 6 | \n", "baseline | \n", "2 | \n", "5 | \n", "2 | \n", "2 | \n", "0.1 | \n", "1.0 | \n", "0.5 | \n", "0.50 | \n", "5 | \n", "... | \n", "5 | \n", "2 | \n", "/Users/benno/Documents/postdoc/projects/minima... | \n", "0.895722 | \n", "0.895722 | \n", "0.895722 | \n", "0.877247 | \n", "0.861811 | \n", "0.865458 | \n", "38407.0 | \n", "
| 7 | \n", "model | \n", "2 | \n", "5 | \n", "2 | \n", "2 | \n", "0.1 | \n", "1.0 | \n", "0.5 | \n", "0.50 | \n", "5 | \n", "... | \n", "5 | \n", "2 | \n", "/Users/benno/Documents/postdoc/projects/minima... | \n", "0.898951 | \n", "0.898951 | \n", "0.898951 | \n", "0.880854 | \n", "0.865183 | \n", "0.869402 | \n", "38407.0 | \n", "
8 rows × 23 columns
\n", "