{
"cells": [
{
"cell_type": "markdown",
"id": "9b16b625-0856-42fa-b61f-f14ae31eab55",
"metadata": {},
"source": [
"# Surfaceform clusters"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c9dacaf9-0413-468a-bd9f-d86e5d7123fd",
"metadata": {},
"outputs": [],
"source": [
"import json, math, tqdm\n",
"\n",
"fname = \"wiki/nlwiki-20220301/experiments/clean-q0.25.json\"\n",
"anchor_scores = json.load(open(fname))\n",
"# log transform\n",
"anchor_scores = {\n",
" a: {e: math.log1p(c) for e, c in ec.items()} for a, ec in anchor_scores.items()\n",
"}\n",
"# l2 normalize\n",
"anchor_scores = {\n",
" a: {\n",
" e: c / t\n",
" for t in [sum(v**2 for v in ec.values()) ** 0.5]\n",
" for e, c in ec.items()\n",
" }\n",
" for a, ec in anchor_scores.items()\n",
"}\n",
"len(anchor_scores)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "a90af9e2-62e3-4217-a748-4861b70be7fa",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30472/30472 [00:00<00:00, 131802.04it/s]\n"
]
},
{
"data": {
"text/plain": [
"(1900, 4902)"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"score_threshold = 0.5\n",
"\n",
"id_anchors = {}\n",
"for a, es in anchor_scores.items():\n",
" for e in es:\n",
" id_anchors.setdefault(e, set()).add(a)\n",
"\n",
"surface_cluster = {i: i for i in anchor_scores}\n",
"for a, es in tqdm.tqdm(anchor_scores.items()):\n",
" others = set.union(*[id_anchors[e] for e in es]) - set([a])\n",
" for o in others:\n",
" x, y = set(anchor_scores[o]), set(es)\n",
" # score = len(x&y) / len(x|y) # jacc\n",
" score = sum(anchor_scores[o][v] * es[v] for v in x & y) # cosine\n",
" if score > score_threshold:\n",
" surface_cluster[o] = surface_cluster[a]\n",
" # print(f'{a:20s} {o:20s}', score )\n",
"\n",
"clusters = {}\n",
"for s, c in surface_cluster.items():\n",
" clusters.setdefault(c, set()).add(s)\n",
"clusters = [ss for ss in clusters.values() if len(ss) > 1]\n",
"len(clusters), len(set(s for c in clusters for s in c))"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "a58df8c2-cb67-41cc-b8fa-5c3a52c717e8",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"[{'cannondale pro cycling team',\n",
" 'cannondale-drapac pro cycling team',\n",
" 'ef education first pro cycling',\n",
" 'ef education-easypost',\n",
" 'ef education-nippo',\n",
" 'garmin',\n",
" 'garmin sharp',\n",
" 'garmin-sharp',\n",
" 'garmin-slipstream',\n",
" 'garmin-transitions',\n",
" 'team cannondale-garmin',\n",
" 'team ef education first-drapac p/b cannondale',\n",
" 'team garmin-cervélo',\n",
" 'team garmin-sharp',\n",
" 'team garmin-transitions'},\n",
" {'amateurvoetbalclub',\n",
" 'amateurvoetbalvereniging',\n",
" 'bekertoernooi',\n",
" 'clubs',\n",
" 'profvoetbalclub',\n",
" 'profvoetballer',\n",
" 'voetbal',\n",
" 'voetbalbeker',\n",
" 'voetbalbond',\n",
" 'voetbalclub',\n",
" 'voetballer',\n",
" 'voetballers',\n",
" 'voetbalster',\n",
" 'voetbalvereniging'},\n",
" {'frankrijk',\n",
" 'franse',\n",
" 'franse republiek',\n",
" 'franse revolutionaire',\n",
" 'fransen',\n",
" 'fransman',\n",
" 'noord-frankrijk',\n",
" 'noord-franse',\n",
" 'revolutionaire',\n",
" 'revolutionaire frankrijk',\n",
" 'revolutionairen',\n",
" 'zuid-frankrijk',\n",
" 'zuid-franse'},\n",
" {'amerika',\n",
" 'amerikaanse',\n",
" 'amerikaanse staat',\n",
" 'amerikaanse staten',\n",
" 'amerikanen',\n",
" 'de verenigde staten',\n",
" 'u.s.',\n",
" 'united states',\n",
" 'usa',\n",
" 'verenigde staten',\n",
" 'vs'},\n",
" {'brit',\n",
" 'brits',\n",
" 'britse',\n",
" 'britten',\n",
" 'engeland',\n",
" 'engelse',\n",
" 'engelsen',\n",
" 'groot-brittannië',\n",
" 'uk',\n",
" 'verenigd koninkrijk',\n",
" 'vk'},\n",
" {'bondsrepubliek',\n",
" 'bondsrepubliek duitsland',\n",
" 'duits',\n",
" 'duits voetbalelftal',\n",
" 'duitse',\n",
" 'duitse afkomst',\n",
" 'duitser',\n",
" 'duitsers',\n",
" 'nationale elftal',\n",
" 'west-duitse',\n",
" 'west-duitsland'},\n",
" {'componeerde',\n",
" 'componeren',\n",
" 'componist',\n",
" 'compositie',\n",
" 'composities',\n",
" 'gecomponeerd',\n",
" 'liederen',\n",
" 'nummer',\n",
" 'nummers',\n",
" 'track'},\n",
" {'de oost',\n",
" 'india',\n",
" 'indiase',\n",
" 'indisch',\n",
" 'indische',\n",
" 'indië',\n",
" 'indiërs',\n",
" 'indo',\n",
" 'nederlands-indië',\n",
" 'oost-indië'},\n",
" {'katholicisme',\n",
" 'katholiek',\n",
" 'katholieke',\n",
" 'katholieke geloof',\n",
" 'katholieke kerk',\n",
" 'katholieken',\n",
" 'kerk',\n",
" 'kerkelijke',\n",
" 'rooms-katholiek',\n",
" 'rooms-katholieke'},\n",
" {'rome',\n",
" 'romein',\n",
" 'romeinen',\n",
" 'romeins',\n",
" 'romeinse',\n",
" 'romeinse keizerrijk',\n",
" 'romeinse keizertijd',\n",
" 'romeinse oudheid',\n",
" 'romeinse periode',\n",
" 'romeinse tijd'},\n",
" {'olympische spelen 2020',\n",
" 'olympische spelen in tokio',\n",
" 'olympische spelen van 1964',\n",
" 'olympische spelen van tokio',\n",
" 'olympische zomerspelen 1964',\n",
" 'olympische zomerspelen 2020',\n",
" 'os 2020',\n",
" 'tokio 2020',\n",
" 'zomerspelen van 1964'},\n",
" {'deceuninck',\n",
" 'deceuninck–quick-step',\n",
" 'etixx-quick step',\n",
" 'omega pharma-quick step',\n",
" 'quick step',\n",
" 'quick step-alpha vinyl',\n",
" 'quick-step',\n",
" 'quick-step floors',\n",
" 'quickstep'},\n",
" {'csc',\n",
" 'saxo bank',\n",
" 'saxo bank-sungard',\n",
" 'team csc',\n",
" 'team csc saxo bank',\n",
" 'team saxo bank',\n",
" 'team saxo-tinkoff',\n",
" 'tinkoff',\n",
" 'tinkoff-saxo'},\n",
" {'koloniaal',\n",
" 'koloniale',\n",
" 'kolonialisme',\n",
" 'kolonie',\n",
" 'kolonies',\n",
" 'kolonist',\n",
" 'kolonisten',\n",
" 'koloniën',\n",
" 'pionier'},\n",
" {'bezetting',\n",
" 'duitse bezetter',\n",
" 'duitse bezetters',\n",
" 'duitse bezetting',\n",
" 'nazi',\n",
" 'naziregime',\n",
" 'nazis',\n",
" 'oorlogsjaren',\n",
" 'tweede wereldoorlog'},\n",
" {'aanval',\n",
" 'aanvaller',\n",
" 'av',\n",
" 'centrumspits',\n",
" 'linksbuiten',\n",
" 'rechtsbuiten',\n",
" 'schaduwspits',\n",
" 'spits',\n",
" 'vleugelaanvaller'},\n",
" {'gr.',\n",
" 'grieken',\n",
" 'grieks',\n",
" 'griekse',\n",
" 'griekse oudheid',\n",
" 'griekse taal',\n",
" 'oud-griekse',\n",
" 'oude grieken',\n",
" 'oudgrieks'},\n",
" {'italiaan',\n",
" 'italiaans',\n",
" 'italiaanse',\n",
" 'italiaanse republiek',\n",
" 'italianen',\n",
" 'noord-italiaanse',\n",
" 'noord-italië',\n",
" 'zuid-italiaanse',\n",
" 'zuid-italië'},\n",
" {'lampre',\n",
" 'lampre-caffita',\n",
" 'lampre-farnese vini',\n",
" 'lampre-fondital',\n",
" 'lampre-isd',\n",
" 'lampre-merida',\n",
" 'lampre-ngc',\n",
" 'uae team emirates'},\n",
" {'cd',\n",
" 'cd-single',\n",
" 'cds',\n",
" 'debuut',\n",
" 'debuutsingle',\n",
" 'single',\n",
" 'singles',\n",
" 'vinylsingle'},\n",
" {'argos-shimano',\n",
" 'bankgiroloterij',\n",
" 'giant-shimano',\n",
" 'skil-shimano',\n",
" 'team dsm',\n",
" 'team giant-alpecin',\n",
" 'team giant-shimano',\n",
" 'team sunweb'},\n",
" {'federaal',\n",
" 'federale',\n",
" 'federale overheid',\n",
" 'federale republiek',\n",
" 'federalist',\n",
" 'federalisten',\n",
" 'federalistische',\n",
" 'federatie'},\n",
" {'ultratop',\n",
" 'ultratop 200',\n",
" 'ultratop 50',\n",
" 'vlaams',\n",
" 'vlaamse',\n",
" 'vlaanderen',\n",
" 'vlaming',\n",
" 'vlamingen'},\n",
" {'beroepswielrenner',\n",
" 'weg',\n",
" 'wielerploeg',\n",
" 'wielersport',\n",
" 'wielerwedstrijd',\n",
" 'wielrennen',\n",
" 'wielrenner',\n",
" 'wielrenster'},\n",
" {'bonjour',\n",
" 'bouygues télécom',\n",
" 'direct énergie',\n",
" 'europcar',\n",
" 'team europcar',\n",
" 'team totalenergies',\n",
" 'total direct energie'},\n",
" {'fdj',\n",
" 'fdj-bigmat',\n",
" 'fdj.fr',\n",
" 'fdjeux.com',\n",
" 'française des jeux',\n",
" 'groupama-fdj',\n",
" 'la française des jeux'},\n",
" {'academici',\n",
" 'academicus',\n",
" 'academisch',\n",
" 'academische',\n",
" 'universiteit',\n",
" 'universiteiten',\n",
" 'wo'},\n",
" {'golfbaan',\n",
" 'golfclub',\n",
" 'golfclubs',\n",
" 'golfer',\n",
" 'golfprofessional',\n",
" 'golfterrein',\n",
" 'professional'},\n",
" {'londen 2012',\n",
" 'olympische spelen 2012',\n",
" 'olympische spelen in londen',\n",
" 'olympische spelen van 2012',\n",
" 'olympische spelen van londen',\n",
" 'olympische zomerspelen 2012',\n",
" 'olympische zomerspelen van 2012'},\n",
" {'koninkrijk',\n",
" 'koninkrijken',\n",
" 'monarchie',\n",
" 'monarchist',\n",
" 'monarchisten',\n",
" 'monarchistische',\n",
" 'royalisten'},\n",
" {'congo',\n",
" 'congo-brazzaville',\n",
" 'congo-kinshasa',\n",
" 'congolees',\n",
" 'congolese',\n",
" 'kongo',\n",
" 'republiek congo'},\n",
" {'linkervleugelverdediger',\n",
" 'linksachter',\n",
" 'linksback',\n",
" 'rechtervleugelverdediger',\n",
" 'rechtsachter',\n",
" 'rechtsback',\n",
" 'vleugelverdediger'},\n",
" {'kamer',\n",
" 'kamerlid',\n",
" 'kamerverkiezingen',\n",
" 'tweede kamerfractie',\n",
" 'tweede kamerlid',\n",
" 'verkiezingen van 2014',\n",
" 'verkiezingen van 25 mei 2014'},\n",
" {'dagvlinder',\n",
" 'mot',\n",
" 'motten',\n",
" 'nachtvlinder',\n",
" 'nachtvlinders',\n",
" 'vlinder',\n",
" 'vlinders'},\n",
" {'beuk',\n",
" 'beuken',\n",
" 'driebeukige',\n",
" 'kerkschip',\n",
" 'schip',\n",
" 'zijbeuken',\n",
" 'zijschip'},\n",
" {'beschermd monument',\n",
" 'monument',\n",
" 'monumentaal',\n",
" 'monumentale',\n",
" 'monumenten',\n",
" 'monumentenlijst',\n",
" 'onroerend erfgoed'},\n",
" {'film',\n",
" 'film-',\n",
" 'filmmaker',\n",
" 'geregisseerd',\n",
" 'regie',\n",
" 'regisseur',\n",
" 'regisseuse'},\n",
" {'nederlander',\n",
" 'nederlanders',\n",
" 'nederlands',\n",
" 'nederlandse',\n",
" 'nederlandstalig',\n",
" 'nederlandstalige',\n",
" 'nl'},\n",
" {'davitamon-lotto',\n",
" 'lotto soudal',\n",
" 'lotto-belisol',\n",
" 'omega pharma-lotto',\n",
" 'predictor-lotto',\n",
" 'silence-lotto'},\n",
" {'olympische spelen 2016',\n",
" 'olympische spelen van 2016',\n",
" 'olympische zomerspelen 2016',\n",
" 'olympische zomerspelen van 2016',\n",
" 'rio 2016',\n",
" 'rio de janeiro 2016'},\n",
" {'langebaan',\n",
" 'langebaanschaatser',\n",
" 'schaats',\n",
" 'schaatsen',\n",
" 'schaatser',\n",
" 'schaatsster'},\n",
" {'gif', 'giftig', 'giftige', 'giftigheid', 'toxisch', 'toxische'},\n",
" {'warner',\n",
" 'warner bros',\n",
" 'warner bros.',\n",
" 'warner bros. pictures',\n",
" 'warner bros. records',\n",
" 'warner brothers'},\n",
" {'elpee', 'lp', 'lps', 'plaat', 'platen', 'vinyl'},\n",
" {'gereformeerd',\n",
" 'gereformeerde',\n",
" 'gereformeerde kerk',\n",
" 'gereformeerde kerken',\n",
" 'gereformeerde kerken in nederland',\n",
" 'gereformeerden'},\n",
" {'neoclassicisme',\n",
" 'neoclassicistisch',\n",
" 'neoclassicistische',\n",
" 'neoclassicistische stijl',\n",
" 'neoklassieke',\n",
" 'neoklassieke stijl'},\n",
" {'krijgsmacht', 'landmacht', 'leger', 'legers', 'militair', 'militaire'},\n",
" {'organist', 'orgel', 'orgelpijpen', 'orgels', 'pijpen', 'pijpwerk'},\n",
" {'verzet',\n",
" 'verzetsgroep',\n",
" 'verzetsman',\n",
" 'verzetsstrijder',\n",
" 'verzetsstrijders',\n",
" 'verzetsstrijdster'},\n",
" {'economen',\n",
" 'economie',\n",
" 'economisch',\n",
" 'economische',\n",
" 'economische wetenschappen',\n",
" 'econoom'},\n",
" {'palestijnen',\n",
" 'palestijns',\n",
" 'palestijnse',\n",
" 'palestijnse gebieden',\n",
" 'palestijnse staat',\n",
" 'palestina'},\n",
" {'joden', 'jodium', 'jood', 'joods', 'joodse', 'joodse gemeenschap'},\n",
" {'ier', 'ieren', 'ierland', 'iers', 'ierse', 'ierse republiek'},\n",
" {'deen', 'deens', 'deense', 'denemarken', 'denen', 'koninkrijk denemarken'},\n",
" {'rus', 'rusland', 'russen', 'russisch', 'russische', 'russische rijk'},\n",
" {'koninkrijk zweden', 'swe', 'zweden', 'zweed', 'zweeds', 'zweedse'},\n",
" {'bardiani csf',\n",
" 'bardiani valvole-csf inox',\n",
" 'bardiani-csf-faizanè',\n",
" 'colnago-csf inox',\n",
" 'csf group-navigare'},\n",
" {'communist', 'communiste', 'communisten', 'communistisch', 'pcb'},\n",
" {'aves', 'avifauna', 'gevogelte', 'vogel', 'vogels'},\n",
" {'farnese vini-neri sottoli',\n",
" 'neri sottoli',\n",
" 'southeast',\n",
" 'vini fantini-selle italia',\n",
" 'wilier triestina-southeast'},\n",
" {'leopard trek',\n",
" 'radioshack leopard',\n",
" 'radioshack-leopard',\n",
" 'radioshack-nissan-trek',\n",
" 'trek factory racing'},\n",
" {'htc-highroad',\n",
" 't-mobile team',\n",
" 'team columbia',\n",
" 'team htc-columbia',\n",
" 'team stuttgart'},\n",
" {'wereldkampioenschappen sprint',\n",
" 'wk
sprint',\n",
" 'wk
sprint',\n",
" 'wk sprint',\n",
" 'wk
sprint'},\n",
" {'buis', 'buizen', 'leidingen', 'pijpleiding', 'pijpleidingen'},\n",
" {'hallucinogene',\n",
" 'psychedelica',\n",
" 'psychedelisch',\n",
" 'psychedelische',\n",
" 'psychedelische rock'},\n",
" {'ineos grenadiers', 'sky', 'sky procycling', 'team ineos', 'team sky'},\n",
" {'astana pro team',\n",
" 'astana qazaqstan',\n",
" 'astana-premier tech',\n",
" 'liberty seguros-würth',\n",
" 'pro team astana'},\n",
" {'erfelijk', 'erfelijke', 'erfelijkheid', 'geneticus', 'genetisch'},\n",
" {'olympisch',\n",
" 'olympisch kampioen',\n",
" 'olympisch kampioene',\n",
" 'olympische',\n",
" 'spelen'},\n",
" {'olympische spelen 2008',\n",
" 'olympische spelen in peking',\n",
" 'olympische spelen van peking',\n",
" 'olympische zomerspelen 2008',\n",
" 'peking 2008'},\n",
" {'ghz', 'hertz', 'hz', 'khz', 'mhz'},\n",
" {'magneet', 'magneten', 'magnetisch', 'magnetisch veld', 'magnetische'},\n",
" {'gasthoogleraar', 'gewoon hoogleraar', 'hoogleraar', 'prof.', 'professor'},\n",
" {'evangelisch-luthers',\n",
" 'evangelisch-lutherse',\n",
" 'evangelisch-lutherse kerk',\n",
" 'luthers',\n",
" 'lutherse'},\n",
" {'mitchelton-scott',\n",
" 'orica greenedge',\n",
" 'orica-bikeexchange',\n",
" 'orica-scott',\n",
" 'team bikeexchange'},\n",
" {'fl', 'fl.', 'florida', 'gulden', 'ƒ'},\n",
" {'middelbaar onderwijs',\n",
" 'middelbare',\n",
" 'middelbare scholen',\n",
" 'middelbare school',\n",
" 'secundair'},\n",
" {'habsburg', 'habsburgers', 'habsburgs', 'habsburgse', 'habsburgse rijk'},\n",
" {'astrologen', 'astrologie', 'astrologisch', 'astrologische', 'astroloog'},\n",
" {'belasting', 'belastingen', 'fiscaal', 'fiscale', 'schatting'},\n",
" {'gelatiniseerd', 'gelatiniseerde', 'latijn', 'latijns', 'latijnse'},\n",
" {'wereldkampioenschappen voor junioren',\n",
" 'wjk',\n",
" 'wk junioren',\n",
" 'wk u20',\n",
" 'wk voor junioren'},\n",
" {'wereldkampioenschappen afstanden',\n",
" 'wk
afstanden',\n",
" 'wk
afstanden',\n",
" 'wk afstanden',\n",
" 'wk
afstanden'},\n",
" {'elisabeth',\n",
" 'elisabeth in beieren',\n",
" 'elisabeth van belgië',\n",
" 'koningin elisabeth',\n",
" 'prinses elisabeth'},\n",
" {'ferry', 'pont', 'veer', 'veerdienst', 'veerpont'},\n",
" {'babylon', 'babylonisch', 'babylonische', 'babylonië', 'babyloniërs'},\n",
" {'nederlandse publieke omroep',\n",
" 'npo',\n",
" 'openbare omroep',\n",
" 'publieke',\n",
" 'publieke omroep'},\n",
" {'gelder', 'gelderland', 'gelders', 'gelderse', 'gelre'},\n",
" {'organisch',\n",
" 'organische',\n",
" 'organische stoffen',\n",
" 'organische verbinding',\n",
" 'organische verbindingen'},\n",
" {'electronic', 'electronica', 'elektronica', 'elektronisch', 'elektronische'},\n",
" {'dokter', 'geneeskunde', 'geneeskundige', 'medicijnen', 'medische'},\n",
" {'basisonderwijs',\n",
" 'basisscholen',\n",
" 'basisschool',\n",
" 'lager onderwijs',\n",
" 'lagere school'},\n",
" {'schaak', 'schaakpartij', 'schaakster', 'schaakvereniging', 'schaken'},\n",
" {'gotiek', 'gotisch', 'gotische', 'laatgotisch', 'laatgotische'},\n",
" {'rugby', 'rugbyclub', 'rugbyers', 'rugbyspeler', 'rugbyteam'},\n",
" {'genova', 'genua', 'genuees', 'genuese', 'genuezen'},\n",
" {'be', 'belg', 'belgen', 'belgisch', 'belgische'},\n",
" {'humanisme', 'humanist', 'humanisten', 'humanistisch', 'humanistische'},\n",
" {'siam', 'thai', 'thailand', 'thais', 'thaise'},\n",
" {'beschrijving', 'wapen', 'wapens', 'wapenschild', 'wapenschilden'},\n",
" {'perzen', 'perzisch', 'perzische', 'perzische rijk', 'perzië'},\n",
" {'klinker', 'klinkers', 'vocalen', 'zang', 'zangers'},\n",
" {'filosofe', 'filosofen', 'filosofie', 'filosofisch', 'filosoof'},\n",
" {'let', 'letland', 'lets', 'letse', 'republiek letland'},\n",
" {'litouwen', 'litouwers', 'litouws', 'litouwse', 'ltu'},\n",
" {'bas', 'basgitaar', 'basgitarist', 'bass', 'bassist'},\n",
" {'bosnisch',\n",
" 'bosnische',\n",
" 'bosnië',\n",
" 'bosnië en herzegovina',\n",
" 'bosnië-herzegovina'},\n",
" {'mar', 'marokkaan', 'marokkaans', 'marokkaanse', 'marokko'},\n",
" {'barok', 'barokke', 'barokperiode', 'barokschilder', 'barokstijl'},\n",
" {'serven', 'servisch', 'servische', 'servië', 'serviër'},\n",
" {'mathematica', 'wis', 'wis-', 'wiskunde', 'wiskundige'},\n",
" {'doel', 'doelman', 'doelvrouw', 'gk', 'keeper'},\n",
" {'show', 'theater', 'theatermaker', 'theaterstukken', 'toneel'},\n",
" {'noor', 'noors', 'noorse', 'nor', 'noren'},\n",
" {'hongaar', 'hongaars', 'hongaarse', 'hongaren', 'hongarije'},\n",
" {'fin', 'finland', 'finnen', 'fins', 'finse'},\n",
" {'auteur', 'publicist', 'schrijfster', 'schrijver', 'schrijvers'},\n",
" {'limburg', 'limburgers', 'limburgs', 'limburgse', 'provincie limburg'},\n",
" {'japan', 'japanner', 'japanners', 'japans', 'japanse'},\n",
" {'spaans', 'spaanse', 'spanjaard', 'spanjaarden', 'spanje'},\n",
" {'ingepolderd', 'inpoldering', 'polder', 'polders'},\n",
" {'footon-servetto', 'fuji-servetto', 'geox-tmc', 'scott-american beef'},\n",
" {'androni giocattoli-sidermec',\n",
" 'androni giocattoli-venezuela',\n",
" 'drone hopper-androni giocattoli',\n",
" 'selle italia'},\n",
" {'huis', 'woning', 'woningen', 'woonhuis'},\n",
" {'mythen', 'mythische', 'mythologie', 'mythologische'},\n",
" {'roompot oranje peloton',\n",
" 'roompot-charles',\n",
" 'roompot-nederlandse loterij',\n",
" 'roompot-oranje peloton'},\n",
" {'autocoureur', 'coureur', 'coureurs', 'motorcoureur'},\n",
" {'geïndustrialiseerde', 'industrialisatie', 'industrie', 'industriële'},\n",
" {'eedgenootschap',\n",
" 'eedgenoten',\n",
" 'zwitsers eedgenootschap',\n",
" 'zwitserse eedgenootschap'},\n",
" {'accent.jobs-willems verandas',\n",
" 'circus-wanty-gobert',\n",
" 'intermarché-wanty-gobert matériaux',\n",
" 'wanty-groupe gobert'},\n",
" {'doctor', 'doctoraat', 'doctorstitel', 'ph.d.'},\n",
" {'gymnast', 'gymnaste', 'gymnastiek', 'turner'},\n",
" {'plant', 'planten', 'plantensoort', 'species'},\n",
" {'padvinderij', 'scout', 'scouting', 'scouts'},\n",
" {'alpineskiester', 'alpineskiën', 'alpineskiër', 'skiër'},\n",
" {'institutie', 'instituties', 'instituut', 'organisatie'},\n",
" {'afgezet', 'afzetting', 'afzettingen', 'sedimenten'},\n",
" {'retorica', 'retoriek', 'retorisch', 'retorische'},\n",
" {'bayern', 'bayern münchen', 'bayern münchen ii', 'fc bayern münchen'},\n",
" {'bisdom', 'diocees', 'diocesaan', 'diocesane'},\n",
" {'bladsteel', 'bladstelen', 'bloemstengel', 'knol'},\n",
" {'nationale team', 'nederlands handbalteam', 'nederlands team', 'senioren'},\n",
" {'mtn-qhubeka',\n",
" 'ntt pro cycling',\n",
" 'team dimension data',\n",
" 'team qhubeka-assos'},\n",
" {'gepubliceerd', 'publicatie', 'publiceerde', 'verhandeling'},\n",
" {'ruiter', 'ruiters', 'springen', 'springruiter'},\n",
" {'westen', 'westers', 'westerse', 'westerse beschaving'},\n",
" {'academie van beeldende kunsten',\n",
" 'academie voor beeldende kunsten',\n",
" 'koninklijke academie',\n",
" 'koninklijke academie van beeldende kunsten'},\n",
" {'wereldkampioenschappen allround',\n",
" 'wk
allround',\n",
" 'wk
allround',\n",
" 'wk
allround'},\n",
" {'aartsbisdom utrecht',\n",
" 'aartsbisschop van utrecht',\n",
" 'bisdom utrecht',\n",
" 'bisschop van utrecht'},\n",
" {'vesting', 'vestingwal', 'wal', 'wallen'},\n",
" {'olympische spelen 2000',\n",
" 'olympische spelen in sydney',\n",
" 'olympische spelen van sydney',\n",
" 'olympische zomerspelen 2000'},\n",
" {'maximiliaan',\n",
" 'maximiliaan i',\n",
" 'maximiliaan van habsburg',\n",
" 'maximiliaan van oostenrijk'},\n",
" {'athene 2004',\n",
" 'olympische spelen in athene',\n",
" 'olympische spelen van athene',\n",
" 'olympische zomerspelen 2004'},\n",
" {'manufacturen', 'textiel', 'textielfabriek', 'weefsel'},\n",
" {'katjoesja',\n",
" 'team katjoesja',\n",
" 'team katjoesja alpecin',\n",
" 'tinkoff credit systems'},\n",
" {'gbr fea', 'gbr spr', 'sil', 'silverstone'},\n",
" {'belkin', 'jumbo-visma', 'team jumbo-visma', 'team lottonl-jumbo'},\n",
" {'cannondale', 'liquigas', 'liquigas-cannondale', 'liquigas-doimo'},\n",
" {'impressionisme',\n",
" 'impressionisten',\n",
" 'impressionistisch',\n",
" 'impressionistische'},\n",
" {'bourgondische',\n",
" 'bourgondiërs',\n",
" 'hertog van bourgondië',\n",
" 'hertogen van bourgondië'},\n",
" {'gemeenteraad', 'gemeenteraadslid', 'raad', 'raadslid'},\n",
" {'staats', 'staatse', 'staatse leger', 'staatse troepen'},\n",
" {'carnivore', 'carnivoren', 'roofdieren', 'vleesetende'},\n",
" {'patriot', 'patriotten', 'patriottisch', 'patriottische'},\n",
" {'individuele tijdrit', 'itt', 'tijdrijden', 'tijdrit'},\n",
" {'klassiek', 'klassieke', 'klassieke muziek', 'muziekgeschiedenis'},\n",
" {'gesynthetiseerd', 'synthese', 'synthetisch', 'synthetische'},\n",
" {'zuid-', 'zuid-amerika', 'zuid-amerikaans', 'zuid-amerikaanse'},\n",
" {'ratio', 'rationeel', 'rationele', 'rede'},\n",
" {'gereduceerd', 'reducerende', 'reducerende stoffen', 'reductie'},\n",
" {'orangisme', 'orangist', 'orangisten', 'orangistische'},\n",
" {'caisse depargne',\n",
" 'caisse depargne-illes balears',\n",
" 'movistar',\n",
" 'movistar team'},\n",
" {'diplomaat', 'diplomaten', 'diplomatie', 'gezant'},\n",
" {'biografie', 'biografieën', 'biografisch', 'biografische'},\n",
" {'inheems', 'inheemse', 'inheemse bevolking', 'inheemsen'},\n",
" {'heiden', 'heidenen', 'heidens', 'heidense'},\n",
" {'republikein', 'republikeinen', 'republikeins', 'republikeinse'},\n",
" {'socialisme', 'socialisten', 'socialistisch', 'socialistische'},\n",
" {'assur', 'assyrische', 'assyriërs', 'aššur'},\n",
" {'allegorie', 'allegorieën', 'allegorisch', 'allegorische'},\n",
" {'aanslag', 'aanslagen', 'terreur', 'terroristische aanslagen'},\n",
" {'afghaans', 'afghanen', 'afghanistan', 'islamitisch emiraat afghanistan'},\n",
" {'album', 'albums', 'soloalbum', 'studioalbum'},\n",
" {'laatromaanse', 'romaans', 'romaanse', 'romaanse stijl'},\n",
" {'hulpkerk', 'kapel', 'kapelletje', 'veldkapel'},\n",
" {'heilig', 'heilig verklaard', 'heilige', 'heiligverklaring'},\n",
" {'archeologe', 'archeologen', 'archeologisch onderzoek', 'archeoloog'},\n",
" {'afbeeldingen', 'grafiek', 'prent', 'prenten'},\n",
" {'app', 'applicatie', 'applicaties', 'apps'},\n",
" {'space shuttle', 'spaceshuttle', 'spaceshuttlemissie', 'spaceshuttles'},\n",
" {'europa cup', 'europa cup i', 'europacup', 'europacup i'},\n",
" {'byzantijns', 'byzantijnse', 'byzantium', 'constantinopel'},\n",
" {'marine', 'marineofficier', 'nederlandse marine', 'officier'},\n",
" {'beeld', 'beelden', 'sculpturen', 'sculptuur'},\n",
" {'islam', 'islamitische', 'moslim', 'moslims'},\n",
" {'amazigh', 'berber', 'berbers', 'berberse'},\n",
" {'cetacea', 'walvis', 'walvisachtigen', 'walvissen'},\n",
" {'arabier', 'arabieren', 'arabisch', 'arabische'},\n",
" {'europa', 'europees', 'europese', 'europese vasteland'},\n",
" {'kunstschilder', 'kunstschilders', 'schilder', 'schilderen'},\n",
" {'fruit', 'schil', 'vrucht', 'vruchten'},\n",
" {'martelaar', 'martelaren', 'martelares', 'marteldood'},\n",
" {'disney', 'disneyfilm', 'disneys', 'walt disney'},\n",
" {'eclecticisme', 'eclectisch', 'eclectische', 'eclectische stijl'},\n",
" {'genus', 'geslacht', 'geslachten', 'geslachtsnaam'},\n",
" {'normandisch', 'normandische', 'normandië', 'normandiërs'},\n",
" {'slaaf', 'slaven', 'slavenhandel', 'slavernij'},\n",
" {'coach', 'trainer', 'voetbalcoach', 'voetbaltrainer'},\n",
" {'activisme', 'activist', 'activisten', 'activistische'},\n",
" {'coronacrisis', 'coronapandemie', 'coronavirus', 'covid-19'},\n",
" {'evolutie', 'evolutionair', 'evolutionaire', 'geëvolueerd'},\n",
" {'dictator', 'dictatoriaal', 'dictatoriale', 'dictatuur'},\n",
" {'beieren', 'beiers', 'beierse', 'hertogdom beieren'},\n",
" {'christelijk', 'christelijke', 'christen', 'christenen'},\n",
" {'dieren', 'dierenrijk', 'diersoorten', 'fauna'},\n",
" {'democraat', 'democraten', 'democratisch', 'democratische'},\n",
" {'tropen', 'tropisch', 'tropische', 'tropische gebieden'},\n",
" {'catalaans', 'catalaanse', 'catalanen', 'catalonië'},\n",
" {'aartsbisschop van keulen', 'keulen', 'keulse', 'köln'},\n",
" {'christus', 'jezus', 'jezus christus', 'jezus van nazareth'},\n",
" {'basken', 'baskenland', 'baskisch', 'baskische'},\n",
" {'fysica', 'fysische', 'natuur-', 'natuurkunde'},\n",
" {'tibet', 'tibetaans', 'tibetaanse', 'tibetanen'},\n",
" {'slowaaks', 'slowaakse', 'slowaakse republiek', 'slowakije'},\n",
" {'koninkrijk pruisen', 'pruisen', 'pruisisch', 'pruisische'},\n",
" {'venetiaans', 'venetiaanse', 'venetianen', 'venetië'},\n",
" {'macedonisch', 'macedonische', 'macedonië', 'noord-macedonië'},\n",
" {'baan', 'baanwielrennen', 'baanwielrenner', 'scratch'},\n",
" {'rode leger', 'sovjet', 'sovjets', 'sovjettroepen'},\n",
" {'comedy', 'komedie', 'komisch', 'komische'},\n",
" {'sloveens', 'sloveense', 'slovenen', 'slovenië'},\n",
" {'oekraïens', 'oekraïense', 'oekraïne', 'ukr'},\n",
" {'game', 'gamer', 'gaming', 'spel'},\n",
" {'schotland', 'schots', 'schotse', 'schotten'},\n",
" {'bretagne', 'bretoense', 'bretons', 'bretonse'},\n",
" {'metro', 'metrolijn', 'metrolijnen', 'u-bahn'},\n",
" {'oostenrijk', 'oostenrijkers', 'oostenrijks', 'oostenrijkse'},\n",
" {'kroaat', 'kroaten', 'kroatisch', 'kroatische'},\n",
" {'egypte', 'egyptenaren', 'egyptisch', 'egyptische'},\n",
" {'gitaar', 'gitarist', 'leadgitaar', 'slaggitaar'},\n",
" {'eerste klasse', 'eerste klasse a', 'eersteklasser', 'jupiler league'},\n",
" {'jazz', 'jazzpianist', 'pianist', 'pianiste'},\n",
" {'tsjechen', 'tsjechisch', 'tsjechische', 'tsjechië'},\n",
" {'roemeens', 'roemeense', 'roemenen', 'roemenië'},\n",
" {'ch', 'sui', 'zwitserland', 'zwitsers'},\n",
" {'portugal', 'portugees', 'portugese', 'portugezen'},\n",
" {'bra', 'braziliaans', 'braziliaanse', 'brazilië'},\n",
" {'groothertogdom luxemburg', 'luxemburg', 'luxemburgs', 'luxemburgse'},\n",
" {'polen', 'pool', 'pools', 'poolse'},\n",
" {'aus', 'australisch', 'australische', 'australië'},\n",
" {'china', 'chinees', 'chinese', 'chinezen'},\n",
" {'fr', 'frans', 'franstalig', 'franstalige'},\n",
" {'gevaccineerd', 'vaccin', 'vaccinatie'},\n",
" {'gp van zwitserland', 'grand prix van zwitserland', 'zwi'},\n",
" {'betovering', 'magische', 'toveren'},\n",
" {'wereldkampioenschap voetbal onder 20', 'wk onder 20', 'wk onder-20'},\n",
" {'kegel', 'kegels', 'taps'},\n",
" {'burgerlijk', 'civiel recht', 'civiele procedure'},\n",
" {'duitse keizer', 'keizerlijke', 'rooms-duits'},\n",
" {'hyacint', 'hyacinten', 'hyacinthus'},\n",
" {'creolen', 'creool', 'creoolse'},\n",
" {'nations league', 'uefa nations league', 'uefa nations league 2020/21'},\n",
" {'bok', 'geit', 'geiten'},\n",
" {'comedyserie', 'komedieserie', 'sitcom'},\n",
" {'container', 'containers', 'containerterminal'},\n",
" {'jaren 80', 'jaren tachtig', 'tachtig'},\n",
" {'jaren 70', 'jaren zeventig', 'zeventig'},\n",
" {'bisschop', 'bisschopswijding', 'gewijd'},\n",
" {'profetie', 'voorspelling', 'voorspellingen'},\n",
" {'schieten', 'schietsport', 'schutter'},\n",
" {'gesteriliseerd', 'sterilisatie', 'steriliseren'},\n",
" {'procyon', 'wasbeer', 'wasberen'},\n",
" {'bora-argon 18', 'bora-hansgrohe', 'team netapp-endura'},\n",
" {'bourgeois', 'burger', 'burgers'},\n",
" {'minimalisme', 'minimalistisch', 'minimalistische'},\n",
" {'neutraal', 'neutrale', 'neutraliteit'},\n",
" {'topsport vlaanderen',\n",
" 'topsport vlaanderen-baloise',\n",
" 'topsport vlaanderen-mercator'},\n",
" {'bisdom luik', 'bisschop van luik', 'luiks'},\n",
" {'cofidis', 'cofidis, le crédit en ligne', 'cofidis, solutions crédits'},\n",
" {'discipline', 'disciplines', 'vakgebied'},\n",
" {'fiets', 'fietsen', 'fietsers'},\n",
" {'geschiedkundige', 'historica', 'historicus'},\n",
" {'rijtuig', 'rijtuigen', 'wagons'},\n",
" {'novo nordisk', 'team novo nordisk', 'team type 1-sanofi'},\n",
" {'cinema', 'cinematograaf', 'videos'},\n",
" {'ag2r la mondiale', 'ag2r-citroën', 'ag2r-la mondiale'},\n",
" {'jaren 90', 'jaren negentig', 'negentig'},\n",
" {'gestucte', 'pleister', 'stucwerk'},\n",
" {'ek
allround', 'ek
allround', 'europese titel'},\n",
" {'geschiedenis', 'historisch', 'historische'},\n",
" {'gestalkt', 'stalken', 'stalker'},\n",
" {'zout water', 'zoutwater', 'zoutwatervis'},\n",
" {'bos', 'bossen', 'woud'},\n",
" {'dnipro', 'dnipropetrovsk', 'dnjepropetrovsk'},\n",
" {'historiserende', 'historiserende stijl', 'historisme'},\n",
" {'tweezaadlobbig', 'tweezaadlobbige', 'tweezaadlobbige planten'},\n",
" {'visigoten', 'visigotisch', 'visigotische'},\n",
" {'agrarische', 'agrarische sector', 'landbouw'},\n",
" {'continu', 'continue', 'continuïteit'},\n",
" {'discus', 'discuswerpen', 'discuswerper'},\n",
" {'bastaard', 'bastaarden', 'hybridisatie'},\n",
" {'sanoma', 'sanoma media', 'sanoma uitgevers'},\n",
" {'amsterdam sloterdijk', 'sloterdijk', 'station sloterdijk'},\n",
" {'districtsraad', 'districtsraden', 'districtsvoorzitter'},\n",
" {'mediterraan', 'mediterrane', 'middellandse zeegebied'},\n",
" {'polytechnische school', 'technische hogeschool', 'technische universiteit'},\n",
" {'transcendent', 'transcendente', 'transcendentie'},\n",
" {'arkéa-samsic', 'bretagne-séché environnement', 'fortuneo-vital concept'},\n",
" {'huis van oranje', 'oranje-nassau', 'oranjes'},\n",
" {'postmoderne', 'postmodernisme', 'postmodernistische'},\n",
" {'ampère', 'stroom', 'stroomsterkte'},\n",
" {'antwerp', 'antwerp fc', 'royal antwerp'},\n",
" {'golfoorlog', 'oorlog in irak', 'tweede golfoorlog'},\n",
" {'muziekpedagoog', 'pedagoge', 'pedagoog'},\n",
" {'ramen', 'venster', 'vensters'},\n",
" {'moraliteit', 'moreel', 'morele'},\n",
" {'resistent', 'resistente', 'resistentie'},\n",
" {'geologisch tijdvak', 'geologische geschiedenis', 'tijdperk'},\n",
" {'olympische spelen 1984',\n",
" 'olympische spelen van los angeles',\n",
" 'olympische zomerspelen 1984'},\n",
" {'archeologische vindplaats', 'site', 'sites'},\n",
" {'kg', 'kilo', 'µg'},\n",
" {'spoorlijn', 'spoorweg', 'spoorwegen'},\n",
" {'viking', 'vikings', 'vikingtijd'},\n",
" {'aziatisch', 'aziatische', 'azië'},\n",
" {'bahrain mclaren', 'bahrain-merida', 'bahrain-victorious'},\n",
" {'twente', 'twents', 'twentse'},\n",
" {'zuid-nederland', 'zuid-nederlands', 'zuidelijke nederlanden'},\n",
" {'mannelijk', 'mannelijke', '♂'},\n",
" {'modernisme', 'modernistische', 'moderniteit'},\n",
" {'sociaal', 'sociale', 'sociale wetenschappen'},\n",
" {'criticus', 'kritisch', 'muziekcriticus'},\n",
" {'expressionisme', 'expressionistische', 'expressionistische stijl'},\n",
" {'gaskamer', 'vergassing', 'vergast'},\n",
" {'lombarden', 'longobarden', 'longobardische'},\n",
" {'abstract', 'abstracte', 'abstractie'},\n",
" {'cult', 'cultstatus', 'scene'},\n",
" {'minister van volksgezondheid',\n",
" 'ministerie van volksgezondheid',\n",
" 'volksgezondheid'},\n",
" {'arenberg', 'hertog van arenberg', 'hertogdom arenberg'},\n",
" {'bekerwinnaar sovjet-unie', 'bekerwinnaar van de sovjet-unie', 'ussr cup'},\n",
" {'belgisch voetbalelftal', 'belgische nationale ploeg', 'nationale ploeg'},\n",
" {'doodstraf', 'executie', 'geëxecuteerd'},\n",
" {'holocaust', 'jodenvervolging', 'shoah'},\n",
" {'motorrijtuig', 'motorwagen', 'motorwagens'},\n",
" {'whig', 'whig party', 'whigs'},\n",
" {'ca', 'california', 'californië'},\n",
" {'icon', 'iconen', 'icoon'},\n",
" {'feldwebel', 'sergeant', 'unteroffizier'},\n",
" {'scenarios', 'script', 'scripts'},\n",
" {'act', 'akte', 'akten'},\n",
" {'editor', 'filmeditor', 'filmmonteur'},\n",
" {'iris', 'irissen', 'lis'},\n",
" {'sirene', 'sirenen', 'sirenes'},\n",
" {'longslak', 'slak', 'slakken'},\n",
" {'dualisme', 'dualistisch', 'dualistische'},\n",
" {'breedte', 'nb', 'noorderbreedte'},\n",
" {'academy award', 'academy awards', 'oscar'},\n",
" {'blind', 'blinde', 'blinden'},\n",
" {'cover', 'coverband', 'covers'},\n",
" {'dode', 'dood', 'stoffelijk overschot'},\n",
" {'10.000 m', '10.000 meter', '10000 m'},\n",
" {'dame', 'dames', 'vrouw'},\n",
" {'labour', 'labour party', 'labour-partij'},\n",
" {'miniaturen', 'miniaturist', 'miniatuur'},\n",
" {'isaan', 'noordoost', 'noordoosten'},\n",
" {'serie', 'series', 'televisieserie'},\n",
" {'billboard', 'billboards', 'de amerikaanse hitlijst'},\n",
" {'beperking', 'handicap', 'mindervaliden'},\n",
" {'saami', 'samen', 'sami'},\n",
" {'akoestiek', 'akoestisch', 'akoestische'},\n",
" {'science fiction', 'sciencefiction', 'sf'},\n",
" {'strategie', 'strategisch', 'strategische'},\n",
" {'piano', 'pianoforte', 'pianoles'},\n",
" {'de kempen', 'kempen', 'kempens'},\n",
" {'leningrad', 'sint-petersburg', 'st. petersburg'},\n",
" {'somalisch', 'somalische', 'somalië'},\n",
" {'kat', 'kater', 'katten'},\n",
" {'indo-europeanen', 'indo-europees', 'indo-europese'},\n",
" {'gothenburg', 'göteborg', 'göteborg c'},\n",
" {'nederzetting', 'plaats', 'woonplaats'},\n",
" {'beker van rusland', 'bekerwinnaar rusland', 'bekerwinnaar van rusland'},\n",
" {'drone', 'drones', 'uav'},\n",
" {'jong vitesse', 'sbv vitesse', 'vitesse'},\n",
" {'opstand', 'rebel', 'rebellen'},\n",
" {'caraïbische', 'cariben', 'caribische'},\n",
" {'molukken', 'molukkers', 'molukse'},\n",
" {'tantra', 'tantras', 'tantrische'},\n",
" {'grootloge', 'obediëntie', 'vrijmetselaarsloge'},\n",
" {'conservatief', 'conservatieve', 'conservatieven'},\n",
" {'leen', 'leengoed', 'lenen'},\n",
" {'fellow', 'leden', 'lid'},\n",
" {'im', 'internationaal meester', 'meester'},\n",
" {'jaren 10', 'jaren tien', 'tien'},\n",
" {'bus', 'buslijn', 'busvervoer'},\n",
" {'economische zaken',\n",
" 'minister van economische zaken',\n",
" 'ministerie van economische zaken'},\n",
" {'binnenlandse zaken',\n",
" 'minister van binnenlandse zaken',\n",
" 'ministerie van binnenlandse zaken'},\n",
" {'grote', 'pacific', 'pacifische'},\n",
" {'boerderij', 'hoeve', 'hofstede'},\n",
" {'europees hof van justitie',\n",
" 'hof van justitie',\n",
" 'hof van justitie van de europese gemeenschappen'},\n",
" {'academie',\n",
" 'academie voor schone kunsten',\n",
" 'koninklijke academie voor schone kunsten'},\n",
" {'promotie', 'promoveerde', 'promoveren'},\n",
" {'regent', 'regenten', 'regentes'},\n",
" {'1.1', '1.2', 'classificatie'},\n",
" {'versnelling', 'versnellingen', 'versnellingsbak'},\n",
" {'den', 'dennen', 'dennenboom'},\n",
" {'voeding', 'voedingsmiddelen', 'voedsel'},\n",
" {'aram', 'aramees', 'aramese'},\n",
" {'botanica', 'botanicus', 'plantkundige'},\n",
" {'alkali', 'base', 'basen'},\n",
" {'bestand', 'bestanden', 'wapenstilstand'},\n",
" {'dominicaans', 'dominicaanse', 'dominicaanse republiek'},\n",
" {'taranto', 'tarente', 'tarentum'},\n",
" {'kristal', 'kristallen', 'kristallijne'},\n",
" {'codex', 'codices', 'manuscripten'},\n",
" {'naturalisme', 'naturalistisch', 'naturalistische'},\n",
" {'wortel', 'wortelen', 'wortels'},\n",
" {'onafhankelijk', 'onafhankelijke', 'onafhankelijkheid'},\n",
" {'anglicaans', 'anglicaanse', 'anglicaanse kerk'},\n",
" {'protestants', 'protestantse', 'protestantse kerk'},\n",
" {'duurzaam', 'duurzaamheid', 'duurzame'},\n",
" {'leuvense universiteit', 'universiteit leuven', 'universiteit van leuven'},\n",
" {'breuk', 'breuken', 'teller'},\n",
" {'deel', 'delen', 'deling'},\n",
" {'maya', 'mayas', 'mayastad'},\n",
" {'orthodox', 'orthodoxe', 'orthodoxe kerk'},\n",
" {'antillen', 'antilliaanse', 'nederlandse antillen'},\n",
" {'keyboards', 'toetsen', 'toetsenist'},\n",
" {'rollenspel', 'rollenspellen', 'rpg'},\n",
" {'counties', 'county', 'countys'},\n",
" {'jammu en kasjmir', 'kashmir', 'kasjmir'},\n",
" {'schutterij', 'schutterijen', 'schuttersgilde'},\n",
" {'overeenkomst', 'transactie', 'transacties'},\n",
" {'hoer', 'prostituee', 'prostituees'},\n",
" {'geloof', 'godsdienst', 'religieuze'},\n",
" {'magazine', 'periodiek', 'tijdschrift'},\n",
" {'dubbelspion', 'spion', 'spionage'},\n",
" {'standard', 'standard de liège', 'standard luik'},\n",
" {'leadzanger', 'zanger', 'zangeres'},\n",
" {'punk', 'punkband', 'punkbeweging'},\n",
" {'varken', 'varkens', 'zwijnen'},\n",
" {'reïncarnatie', 'wedergeboorte', 'wedergeboren'},\n",
" {'primera b', 'segunda división', 'segunda división a'},\n",
" {'cultureel', 'culturen', 'cultuur'},\n",
" {'ddr', 'duitse democratische republiek', 'oost-duitsland'},\n",
" {'reggio', 'reggio emilia', 'reggio nellemilia'},\n",
" {'bangladesh', 'bengaalse', 'bengalen'},\n",
" {'chirurgische ingreep', 'operatie', 'operaties'},\n",
" {'europees voetbal', 'europese competities', 'toernooien'},\n",
" {'maat', 'maatsoort', 'maten'},\n",
" {'j', 'jaar', 'jaren'},\n",
" {'mongolen', 'mongools', 'mongoolse'},\n",
" {'vietnam', 'vietnamees', 'vietnamese'},\n",
" {'financiën', 'minister van financiën', 'ministerie van financiën'},\n",
" {'open source', 'open-source', 'opensource'},\n",
" {'toscaans', 'toscaanse', 'toscane'},\n",
" {'matteüs', 'mattheus', 'mattheüs'},\n",
" {'morfologie', 'morfologisch', 'morfologische'},\n",
" {'strip', 'stripreeks', 'stripserie'},\n",
" {'vertaalster', 'vertaler', 'vertaling'},\n",
" {'vulkanisch', 'vulkanische', 'vulkanische activiteit'},\n",
" {'dogma', 'dogmas', 'dogmatische'},\n",
" {'madagaskar', 'malagassisch', 'malagassische'},\n",
" {'koningin wilhelmina', 'prinses wilhelmina', 'wilhelmina'},\n",
" {'canon', 'canoniek', 'canonieke'},\n",
" {'gemeentefusie', 'gemeentelijke herindeling', 'herindeling'},\n",
" {'geïmproviseerd', 'improvisatie', 'improviseren'},\n",
" {'justitie', 'minister van justitie', 'ministerie van justitie'},\n",
" {'eth', 'ethiopisch', 'ethiopië'},\n",
" {'motorvermogen', 'vermogen', 'vermogens'},\n",
" {'adelaar', 'adelaars', 'arend'},\n",
" {'mijn', 'mijnen', 'zeemijn'},\n",
" {'landskampioen sovjet-unie',\n",
" 'landskampioen van de sovjet-unie',\n",
" 'landskampioenschap van de sovjet-unie'},\n",
" {'piramide', 'piramiden', 'piramides'},\n",
" {'bafta', 'bafta award', 'bafta awards'},\n",
" {'friese', 'friesland', 'friezen'},\n",
" {'territoria', 'territoriaal', 'territorium'},\n",
" {'harmonie', 'harmonieën', 'harmonisch'},\n",
" {'sprinten', 'sprinter', 'sprinters'},\n",
" {'kolom', 'zuil', 'zuilen'},\n",
" {'arrangeerde', 'arrangement', 'arrangeur'},\n",
" {'graaf', 'graf', 'graven'},\n",
" {'realisme', 'realistisch', 'realistische'},\n",
" {'schaap', 'schapen', 'schapenvlees'},\n",
" {'communautaire', 'eu', 'europese unie'},\n",
" {'maleise', 'maleisisch', 'maleisië'},\n",
" {'zwart', 'zwarte', 'zwarten'},\n",
" {'hobo', 'hobos', 'hoboïst'},\n",
" {'syrisch', 'syrische', 'syrië'},\n",
" {'de smurfen', 'smurf', 'smurfen'},\n",
" {'bisdom münster', 'bisschop van münster', 'münster'},\n",
" {'astronomen', 'astronoom', 'sterrenkundige'},\n",
" {'nobelprijs', 'nobelprijswinnaar', 'nobelprijswinnares'},\n",
" {'oezbeeks', 'oezbeekse', 'oezbekistan'},\n",
" {'bijbel', 'bijbelboek', 'bijbelse'},\n",
" {'thema', 'themas', 'thematiek'},\n",
" {'lichaam', 'lichamen', 'menselijk lichaam'},\n",
" {'staal', 'staalindustrie', 'stalen'},\n",
" {'oostfront', 'westelijk front', 'westfront'},\n",
" {'geallieerd', 'geallieerde', 'geallieerden'},\n",
" {'buitenlandse zaken',\n",
" 'minister van buitenlandse zaken',\n",
" 'ministerie van buitenlandse zaken'},\n",
" {'stads', 'stadsgezicht', 'stadsgezichten'},\n",
" {'consul', 'consulaat', 'consuls'},\n",
" {'aartsbisdom mainz', 'aartsbisschop van mainz', 'mainz'},\n",
" {'handschrift', 'handschriften', 'manuscript'},\n",
" {'sint truiden', 'sint-truiden', 'sint-truidense vv'},\n",
" {'waalse', 'walen', 'wallonië'},\n",
" {'orde', 'orden', 'ordes'},\n",
" {'fotomodel', 'model', 'modellen'},\n",
" {'hertogdom lotharingen', 'lotharingen', 'lotharingse'},\n",
" {'azerbeidzjaans', 'azerbeidzjaanse', 'azerbeidzjan'},\n",
" {'moldavisch', 'moldavische', 'moldavië'},\n",
" {'oost-vlaams', 'oost-vlaamse', 'oost-vlaanderen'},\n",
" {'beharing', 'haar', 'haard'},\n",
" {'franken', 'frankisch', 'frankische'},\n",
" {'boogschieten', 'boogschutter', 'boogschutters'},\n",
" {'germaans', 'germaanse', 'germaanse talen'},\n",
" {'producent', 'producenten', 'producer'},\n",
" {'trinidad', 'trinidad & tobago', 'trinidad en tobago'},\n",
" {'wit-rusland', 'wit-russisch', 'wit-russische'},\n",
" {'filipijnen', 'filipijns', 'phi'},\n",
" {'kazachs', 'kazachse', 'kazachstan'},\n",
" {'franse bezetting', 'franse overheersing', 'franse tijd'},\n",
" {'romantiek', 'romantisch', 'romantische'},\n",
" {'albanees', 'albanese', 'albanezen'},\n",
" {'georgisch', 'georgische', 'georgië'},\n",
" {'sur', 'surinaams', 'suriname'},\n",
" {'mediolanum', 'milaan', 'milano'},\n",
" {'om', 'openbaar ministerie', 'parket'},\n",
" {'napels', 'napolitaanse', 'neapolis'},\n",
" {'bulgaars', 'bulgaarse', 'bulgaren'},\n",
" {'theologe', 'theologen', 'theoloog'},\n",
" {'chemie', 'chemische', 'scheikunde'},\n",
" {'alessandro farnese', 'hertog van parma', 'parma'},\n",
" {'lb', 'lublin', 'pound'},\n",
" {'captain', 'kapitein', 'kapiteins'},\n",
" {'bouwkundig', 'kunstwerk', 'kunstwerken'},\n",
" {'zuid-afrika', 'zuid-afrikaans', 'zuid-afrikaanse'},\n",
" {'schepen', 'schepenen', 'vaartuig'},\n",
" {'leiden', 'leids', 'leidse'},\n",
" {'alcohol', 'alcoholgebruik', 'alcoholische'},\n",
" {'goud', 'gouden', 'gouden plaat'},\n",
" {'boheems', 'boheemse', 'bohemen'},\n",
" {'malta', 'maltees', 'maltese'},\n",
" {'cuba', 'cubaan', 'cubaanse'},\n",
" {'fluit', 'fluiten', 'fluitist'},\n",
" {'iraans', 'iraanse', 'iran'},\n",
" {'indonesisch', 'indonesische', 'indonesië'},\n",
" {'joegoslavisch', 'joegoslavische', 'joegoslavië'},\n",
" {'schimmel', 'schimmels', 'schimmelziekte'},\n",
" {'saksen', 'saksisch', 'saksische'},\n",
" {'ijsland', 'ijslands', 'ijslandse'},\n",
" {'rhône', 'rhônedal', 'rhônevallei'},\n",
" {'zeeland', 'zeeuwen', 'zeeuwse'},\n",
" {'plp', 'prl', 'pvv'},\n",
" {'liberaal', 'liberale', 'liberalen'},\n",
" {'tur', 'turkije', 'turks voetbalelftal'},\n",
" {'drum', 'drums', 'slagwerk'},\n",
" {'senaat', 'senator', 'senatoren'},\n",
" {'drama', 'dramafilm', 'dramaserie'},\n",
" {'atheense', 'athene', 'atheners'},\n",
" {'syn.', 'synoniem', 'synoniemen'},\n",
" {'tweede klasse', 'tweede niveau', 'tweedeklasser'},\n",
" {'brussel', 'brussels', 'brusselse'},\n",
" {'holland', 'hollands', 'hollandse'},\n",
" {'liège', 'luik', 'luikse'},\n",
" {'antwerpen', 'antwerps', 'antwerpse'},\n",
" {'new york', 'new york city', 'ny'},\n",
" {'utrecht', 'utrechts', 'utrechtse'},\n",
" {'en', 'engels', 'engelstalige'},\n",
" {'groningen', 'groninger', 'groningse'},\n",
" {'halve finale', 'halve finales'},\n",
" {'nk sprint', 'nk
sprint'},\n",
" {'jan pronk', 'pronk'},\n",
" {'sonnet', 'sonnetten'},\n",
" {'groeve', 'steengroeve'},\n",
" {'corolla', 'toyota corolla'},\n",
" {'versterker', 'versterkers'},\n",
" {'reproductie', 'voortplanting'},\n",
" {'aspect', 'aspecten'},\n",
" {'amir', 'emir'},\n",
" {'erf', 'erven'},\n",
" {'ernst lodewijk', 'ernst lodewijk van hessen-darmstadt'},\n",
" {'gemeentearchief', 'stadsarchief'},\n",
" {'he', 'helium'},\n",
" {'jaren 60', 'jaren zestig'},\n",
" {'julia caesaris', 'julia caesaris maior'},\n",
" {'natie', 'nationale'},\n",
" {'lood', 'pb'},\n",
" {'lezing', 'presentatie'},\n",
" {'prince of wales', 'prins van wales'},\n",
" {'beatrix', 'prinses beatrix'},\n",
" {'quezon', 'tayabas'},\n",
" {'commissariaten', 'rvc'},\n",
" {'eredivisie 2011/12', 'seizoen 2011/12'},\n",
" {'strijker', 'strijkers'},\n",
" {'tl', 'vmbo-t'},\n",
" {'pdc world darts championship', 'world darts championship'},\n",
" {'algonkin', 'algonquin'},\n",
" {'athelstan', 'æthelstan'},\n",
" {'bridge', 'bridger'},\n",
" {'british open', 'brits open'},\n",
" {'caja rural', 'caja rural-seguros rga'},\n",
" {'dwangarbeid', 'dwangarbeiders'},\n",
" {'figurine', 'figurines'},\n",
" {'gate', 'gates'},\n",
" {'gedenkplaat', 'gedenksteen'},\n",
" {'hond', 'honden'},\n",
" {'comedian', 'humorist'},\n",
" {'inhoud', 'volume'},\n",
" {'israëlisch-palestijns conflict', 'israëlisch-palestijnse conflict'},\n",
" {'gould', 'john gould'},\n",
" {'landskampioen italië', 'lega basket serie a'},\n",
" {'grote of sint-martinuskerk', 'martinuskerk'},\n",
" {'michael schumacher', 'schumacher'},\n",
" {'mozaïek', 'mozaïeken'},\n",
" {'naald', 'naalden'},\n",
" {'michajlov', 'nikolaj michajlov'},\n",
" {'nk allround', 'nk
allround'},\n",
" {'ode', 'oden'},\n",
" {'olie-', 'oliemolen'},\n",
" {'palestra itália', 'palmeiras'},\n",
" {'grodziski', 'powiat grodziski'},\n",
" {'recitatie', 'reciteren'},\n",
" {'regenwoud', 'regenwouden'},\n",
" {'revisionisme', 'revisionistische'},\n",
" {'sagan', 'żagań'},\n",
" {'saint martin', 'saint-martin'},\n",
" {'ambrosius', 'santambrogio'},\n",
" {'eredivisie 2008/09', 'seizoen 2008/09'},\n",
" {'seoel', 'seoul'},\n",
" {'tetraëder', 'tetraëdrisch'},\n",
" {'hr.ms. tromp', 'tromp'},\n",
" {'t-rex', 'tyrannosaurus rex'},\n",
" {'conference league', 'uefa europa conference league'},\n",
" {'nippo-vini fantini', 'vini fantini nippo'},\n",
" {'wake', 'wake island'},\n",
" {'algemeen directeur', 'ceo'},\n",
" {'apocrief', 'apocriefe'},\n",
" {'belijdenis', 'geloofsbelijdenis'},\n",
" {'cliché', 'clichés'},\n",
" {'dol', 'dol-de-bretagne'},\n",
" {'dvd', 'dvd-speler'},\n",
" {'egel', 'egels'},\n",
" {'efeze', 'ephesus'},\n",
" {'arbela', 'erbil'},\n",
" {'exarch', 'exarchaat'},\n",
" {'expeditie', 'expedities'},\n",
" {'futen', 'fuut'},\n",
" {'gomel', 'homel'},\n",
" {'huisprelaat', 'pauselijk huisprelaat'},\n",
" {'inr', 'nir'},\n",
" {'gekroond', 'kroning'},\n",
" {'lage vloer', 'lagevloer'},\n",
" {'emanuel', 'manuel i'},\n",
" {'maria amalia', 'maria amalia van oostenrijk'},\n",
" {'nexus', 'the nexus'},\n",
" {'onderwijs', 'vorming'},\n",
" {'parachute', 'parachutisten'},\n",
" {'pragmatisch', 'pragmatische'},\n",
" {'radiostraling', 'rf'},\n",
" {'diagram', 'schema'},\n",
" {'sector', 'sectoren'},\n",
" {'st albans', 'st. albans'},\n",
" {'saint paul', 'st. paul'},\n",
" {'the lord of the rings: the return of the king', 'the return of the king'},\n",
" {'tijl uilenspiegel', 'uilenspiegel'},\n",
" {'verdrag van versailles', 'vrede van versailles'},\n",
" {'wk 1998', 'wk in frankrijk'},\n",
" {'autonomie', 'zelfstandig'},\n",
" {'augustijn', 'augustijnenklooster'},\n",
" {'bas jacobs', 'jacobs'},\n",
" {'edingen', 'enghien'},\n",
" {'els', 'elzen'},\n",
" {'everhard iii', 'everhard iii van württemberg'},\n",
" {'famagusta', 'gazimağusa'},\n",
" {'assemblée nationale', 'franse parlement'},\n",
" {'futurisme', 'futuristen'},\n",
" {'gregoriaans', 'gregoriaanse'},\n",
" {'gehoor', 'horen'},\n",
" {'benji', 'ji'},\n",
" {'kolberg', 'kołobrzeg'},\n",
" {'lateraal', 'laterale'},\n",
" {'derry', 'londonderry'},\n",
" {'deutschland', 'lützow'},\n",
" {'maximum', 'minimum'},\n",
" {'olifant', 'olifanten'},\n",
" {'orlando city', 'orlando city sc'},\n",
" {'palma', 'palma de mallorca'},\n",
" {'pcc-car', 'pcc-cars'},\n",
" {'pedro', 'pedro rodríguez'},\n",
" {'pk', 'pki'},\n",
" {'plesiosauriër', 'plesiosauriërs'},\n",
" {'racing club gent', 'racing gent'},\n",
" {'refuge', 'refugium'},\n",
" {'robben', 'zeehonden'},\n",
" {'sociaal netwerk', 'sociale netwerken'},\n",
" {'sonnenburg', 'słońsk'},\n",
" {'st helens', 'st. helens'},\n",
" {'stabilisator', 'stabilisatoren'},\n",
" {'stockholm', 'stockholm c'},\n",
" {'topschutter', 'topscorer'},\n",
" {'tram 4', 'tramlijn 4'},\n",
" {'lijn 9', 'tramlijn 9'},\n",
" {'trebnitz', 'trzebnica'},\n",
" {'west-friesland', 'west-friezen'},\n",
" {'alg', 'wieren'},\n",
" {'salix', 'wilgen'},\n",
" {'12 angry men', 'wel'},\n",
" {'49er', '49erfx'},\n",
" {'aartshertog van oostenrijk', 'aartshertogin van oostenrijk'},\n",
" {'afrika cup', 'afrikaans kampioenschap'},\n",
" {'alva', 'hertog van alva'},\n",
" {'asiel', 'asielprocedure'},\n",
" {'bogota', 'bogotá'},\n",
" {'cheeta', 'cheetah'},\n",
" {'copulatie', 'paren'},\n",
" {'dynamiek', 'dynamisch'},\n",
" {'ek onder 21', 'europees kampioenschap voetbal onder 21'},\n",
" {'elvis', 'elvis presley'},\n",
" {'eredivisionist', 'nederlandse eredivisie'},\n",
" {'conserveren', 'geconserveerd'},\n",
" {'geiser', 'geisers'},\n",
" {'golden globe award', 'golden globes'},\n",
" {'gudmundsson', 'guðmundsson'},\n",
" {'chip', 'ics'},\n",
" {'infiltratie', 'infiltreren'},\n",
" {'internacional', 'sc internacional'},\n",
" {'dag des oordeels', 'laatste oordeel'},\n",
" {'laodicea', 'latakia'},\n",
" {'libel', 'libellen'},\n",
" {'lijn 12', 'tramlijn 12'},\n",
" {'lijn 17', 'tramlijn 17'},\n",
" {'louise marie', 'louise van orléans'},\n",
" {'lemberg', 'lviv'},\n",
" {'kinshasa', 'léopoldville'},\n",
" {'lourdesgrot', 'mariagrot'},\n",
" {'matrijs', 'matrijzen'},\n",
" {'geometrie', 'meetkunde'},\n",
" {'mesopotamische', 'mesopotamië'},\n",
" {'landbouw, natuur en voedselkwaliteit',\n",
" 'ministerie van landbouw en visserij'},\n",
" {'moezel', 'mosel'},\n",
" {'most valuable player', 'mvp'},\n",
" {'datsun', 'nissan'},\n",
" {'ontginning', 'ontgonnen'},\n",
" {'oortje', 'oortjes'},\n",
" {'paardenstaart', 'paardenstaarten'},\n",
" {'alexander van oranje-nassau', 'prins alexander'},\n",
" {'racing', 'racing club'},\n",
" {'reflectie', 'reflector'},\n",
" {'ladies tour of norway', 'ronde van noorwegen'},\n",
" {'sterk', 'sterke'},\n",
" {'soemba', 'sumba'},\n",
" {'stazione termini', 'termini'},\n",
" {'spoortunnel', 'tunnels'},\n",
" {'verbrand', 'verbranding'},\n",
" {'vierkant', 'vierkante'},\n",
" {'waaier', 'waaiers'},\n",
" {'wereldkampioenschappen in 2003', 'wk in parijs'},\n",
" {'adelheid', 'adelheid van bourgondië'},\n",
" {'alia', 'aliyah'},\n",
" {'alternatieve', 'alternative'},\n",
" {'de gelaarsde kat', 'gelaarsde kat'},\n",
" {'de rode ridder', 'rode ridder'},\n",
" {'de waarheid', 'volksdagblad'},\n",
" {'derde persoon', 'eerste persoon'},\n",
" {'beëdigd', 'eed'},\n",
" {'erkend', 'erkenning'},\n",
" {'etage', 'etages'},\n",
" {'fortuna sittard', 'fsc'},\n",
" {'bouwwerk', 'gebouw'},\n",
" {'kalken', 'kalksteen'},\n",
" {'kan', 'khan'},\n",
" {'oculi', 'oculus'},\n",
" {'philippus', 'philippus ii'},\n",
" {'domproost', 'proosten'},\n",
" {'francs borains', 'r. francs borains'},\n",
" {'ritueel', 'rituelen'},\n",
" {'rodelbaan', 'rodelen'},\n",
" {'röntgen', 'röntgenstraling'},\n",
" {'elisabeth van hongarije', 'sint-elisabeth'},\n",
" {'stargard', 'stargard szczeciński'},\n",
" {'stekelvarken', 'stekelvarkens'},\n",
" {'moyland', 'till'},\n",
" {'moment', 'torsie'},\n",
" {'treptow', 'trzebiatów'},\n",
" {'constellation', 'uss constellation'},\n",
" {'vleet', 'want'},\n",
" {'nesten', 'vogelnest'},\n",
" {'voorstad', 'voorsteden'},\n",
" {'waarneming', 'waarnemingen'},\n",
" {'gereconstrueerd', 'wederopbouw'},\n",
" {'jongerenklassement', 'witte trui'},\n",
" {'zwak', 'zwakke'},\n",
" {'gp van zweden', 'zwe'},\n",
" {'aland', 'kurt'},\n",
" {'boezem', 'boezems'},\n",
" {'ccc polsat polkowice', 'ccc sprandi polkowice'},\n",
" {'chp', 'republikeinse volkspartij'},\n",
" {'correlatie', 'gecorreleerd'},\n",
" {'cynische', 'cynisme'},\n",
" {'dans', 'dansen'},\n",
" {'devon', 'devonshire'},\n",
" {'frederik willem', 'frederik willem i'},\n",
" {'galba', 'servius sulpicius galba'},\n",
" {'garibaldi', 'giuseppe garibaldi'},\n",
" {'haring', 'haringen'},\n",
" {'ambachtsheerlijkheid', 'heerlijkheid'},\n",
" {'kers', 'kersen'},\n",
" {'kiesdistrict', 'kiesdistricten'},\n",
" {'le hom', 'thury-harcourt'},\n",
" {'les villages vovéens', 'voves'},\n",
" {'luchtvrachtmaatschappij', 'vrachtluchtvaartmaatschappij'},\n",
" {'magnus eriksson', 'magnus ii'},\n",
" {'easy listening', 'middle of the road'},\n",
" {'mo', 'moguls'},\n",
" {'noordelijke', 'verenigde nederlanden'},\n",
" {'pd', 'pdl'},\n",
" {'pga championship', 'pga kampioenschap'},\n",
" {'gazprom-rusvelo', 'rusvelo'},\n",
" {'scanner', 'scanners'},\n",
" {'publius cornelius scipio', 'scipio'},\n",
" {'shogun', 'shogunaat'},\n",
" {'stand', 'standen'},\n",
" {'soeur sourire', 'sœur sourire'},\n",
" {'tactiek', 'tactische'},\n",
" {'tataarse', 'tataren'},\n",
" {'baroda', 'vadodara'},\n",
" {'vendel', 'vendels'},\n",
" {'villedieu-les-poêles', 'villedieu-les-poêles-rouffigny'},\n",
" {'afrikaans', 'afrikaanse'},\n",
" {'armada', 'spaanse armada'},\n",
" {'b&b', 'bed & breakfast'},\n",
" {'beursgenoteerd', 'beursgenoteerde'},\n",
" {'elbing', 'elbląg'},\n",
" {'emmerich', 'emmerik'},\n",
" {'europees kampioenschap voetbal', 'europees kampioenschap voetbal 2016'},\n",
" {'evangelische', 'evangelische kerk'},\n",
" {'fan', 'fans'},\n",
" {'fetisj', 'fetisjisme'},\n",
" {'girne', 'kyrenia'},\n",
" {'gracht', 'slotgracht'},\n",
" {'graden', '°'},\n",
" {'kwal', 'kwallen'},\n",
" {'glatz', 'kłodzko'},\n",
" {'l4', 'l6'},\n",
" {'lourenço marques', 'maputo'},\n",
" {'krijgskunst', 'martial arts'},\n",
" {'minister van onderwijs, kunsten en wetenschappen',\n",
" 'ministerie van onderwijs, kunsten en wetenschappen'},\n",
" {'moord', 'vermoord'},\n",
" {'myceense', 'myceners'},\n",
" {'nowodworski', 'powiat nowodworski'},\n",
" {'prins', 'prinses'},\n",
" {'prins-bisschop', 'prinsbisdom'},\n",
" {'radon', 'rn'},\n",
" {'rui', 'ruien'},\n",
" {'private', 'soldaten'},\n",
" {'speerwerpen', 'speerwerper'},\n",
" {'alpaca', 'alpacas'},\n",
" {'angst', 'vrees'},\n",
" {'bezirk', 'bezirke'},\n",
" {'bielski', 'powiat bielski'},\n",
" {'braam', 'bramen'},\n",
" {'chiraal', 'chirale'},\n",
" {'constante', 'constanten'},\n",
" {'de krim', 'krim'},\n",
" {'dialogen', 'dialoog'},\n",
" {'escort', 'ford escort'},\n",
" {'galerij', 'galerijen'},\n",
" {'gebeurtenis', 'gebeurtenissen'},\n",
" {'holstein-gottorp', 'sleeswijk-holstein-gottorp'},\n",
" {'infrarood', 'ir'},\n",
" {'john locke', 'locke'},\n",
" {'melaatsen', 'melaatsheid'},\n",
" {'melville', 'melville-eiland'},\n",
" {'microprocessor', 'processor'},\n",
" {'neil young', 'young'},\n",
" {'olympische spelen in barcelona', 'olympische zomerspelen 1992'},\n",
" {'opolski', 'powiat opolski'},\n",
" {'pola', 'pula'},\n",
" {'rondeel', 'rondelen'},\n",
" {'sint-helena', 'st. helena'},\n",
" {'saur-sojasun', 'sojasun'},\n",
" {'radioshack', 'team radioshack'},\n",
" {'tram 2', 'tramlijn 2'},\n",
" {'lijn 25', 'tramlijn 25'},\n",
" {'verpleegkunde', 'verpleegkundige'},\n",
" {'ver', 'verspringen'},\n",
" {'vrienden', 'vriendschap'},\n",
" {'chan', 'zen'},\n",
" {'aanbidding der wijzen', 'drie koningen'},\n",
" {'anatomie', 'anatoom'},\n",
" {'bekerwedstrijd', 'copa del rey'},\n",
" {'brzeski', 'powiat brzeski'},\n",
" {'digitaal', 'digitale'},\n",
" {'europese parlementsverkiezingen', 'europese verkiezingen'},\n",
" {'export', 'uitvoer'},\n",
" {'altona', 'hamburg-altona'},\n",
" {'hervormde', 'hervormde kerk'},\n",
" {'herzogenrath', 's-hertogenrade'},\n",
" {'indologie', 'indoloog'},\n",
" {'iason', 'jason'},\n",
" {'javaans', 'kawi'},\n",
" {'geodeet', 'landmeter'},\n",
" {'leenman', 'leenmannen'},\n",
" {'aunay-sur-odon', 'les monts daunay'},\n",
" {'lucia', 'santa lucia'},\n",
" {'evros', 'maritsa'},\n",
" {'mis', 'missen'},\n",
" {'nk afstanden', 'nk
afstanden'},\n",
" {'onthoofd', 'onthoofding'},\n",
" {'sint-servaasbasiliek', 'sint-servaaskerk'},\n",
" {'spongebob', 'spongebob squarepants'},\n",
" {'tand', 'tanden'},\n",
" {'tekenaar', 'tekenen'},\n",
" {'theaterschool', 'toneelschool'},\n",
" {'tiberius', 'tiberius claudius nero'},\n",
" {'anders', 'trappist'},\n",
" {'liga', 'voetbalcompetitie'},\n",
" {'weerwolf', 'weerwolven'},\n",
" {'grünberg', 'zielona góra'},\n",
" {'middengewicht', 'zwaargewicht'},\n",
" {'alexandrijnse', 'alexandrië'},\n",
" {'bonen', 'boon'},\n",
" {'congruent', 'congruentie'},\n",
" {'die grünen', 'groenen'},\n",
" {'axis', 'draaier'},\n",
" {'ecologisch', 'ecologische'},\n",
" {'ellips', 'elliptische'},\n",
" {'fabrikant', 'industrieel'},\n",
" {'foto', 'fotos'},\n",
" {'glaciale', 'ijstijden'},\n",
" {'handvest', 'manifest'},\n",
" {'leeg', 'leegte'},\n",
" {'olympische spelen van 1968', 'olympische zomerspelen 1968'},\n",
" {'roest', 'roesten'},\n",
" {'ruprecht', 'ruprecht van de palts'},\n",
" {'symmetrie', 'symmetrisch'},\n",
" {'trivia', 'triviaal'},\n",
" {'hallescher fc wacker', 'wacker halle'},\n",
" {'woord', 'woorden'},\n",
" {'3000 m steeplechase', 'steeplechase'},\n",
" {'accommodatie', 'accommoderen'},\n",
" {'alpecin-fenix', 'corendon-circus'},\n",
" {'anale', 'anus'},\n",
" {'apartheid', 'apartheidsregime'},\n",
" {'boeddhisme', 'boeddhistische'},\n",
" {'boei', 'boeien'},\n",
" {'brantôme', 'brantôme en périgord'},\n",
" {'certificaat', 'certificaten'},\n",
" {'de zeven provinciën', 'hr.ms. de zeven provinciën'},\n",
" {'boekdrukker', 'drukker'},\n",
" {'jacob', 'jakob'},\n",
" {'katholieke universiteit', 'ru'},\n",
" {'kea', 'keos'},\n",
" {'la', 'los angeles'},\n",
" {'middeleeuwen', 'middeleeuwse'},\n",
" {'minister van onderwijs', 'ministerie van onderwijs'},\n",
" {'moderne', 'moderne kunst'},\n",
" {'natuurwet', 'natuurwetten'},\n",
" {'sneeuwwitje', 'sneeuwwitje en de zeven dwergen'},\n",
" {'magiër', 'tovenaar'},\n",
" {'tsaar', 'tsaristische'},\n",
" {'tuig', 'tuigage'},\n",
" {'unitarisme', 'unitariërs'},\n",
" {'verontreiniging', 'vervuiling'},\n",
" {'licht', 'zichtbaar licht'},\n",
" {'het zwin', 'zwin'},\n",
" {'aasgarnalen', 'aasgarnalensoort'},\n",
" {'brahmana', 'brahmanas'},\n",
" {'categorie', 'categorieën'},\n",
" {'doop', 'gedoopt'},\n",
" {'etsen', 'etser'},\n",
" {'gang', 'gangen'},\n",
" {'gisting', 'vergisting'},\n",
" {'goederenvervoer', 'transport'},\n",
" {'gondel', 'gondels'},\n",
" {'hydraulisch', 'hydraulische'},\n",
" {'geïnterneerd', 'internering'},\n",
" {'koninginnedag', 'koningsdag'},\n",
" {'opéra comique', 'opéra-comique'},\n",
" {'lomonosov', 'oranienbaum'},\n",
" {'pees', 'pezen'},\n",
" {'pot', 'potten'},\n",
" {'pylonen', 'pyloon'},\n",
" {'stadia', 'stadium'},\n",
" {'gooi', 't gooi'},\n",
" {'urinoir', 'urinoirs'},\n",
" {'fc wacker innsbruck', 'wacker innsbruck'},\n",
" {'achtergrond', 'achtergrondzang'},\n",
" {'belgisch kampioen', 'belgisch kampioenschap'},\n",
" {'chopper', 'choppers'},\n",
" {'constitutie', 'constitutionele'},\n",
" {'episch', 'epische'},\n",
" {'first-person', 'fps'},\n",
" {'hortus', 'hortus botanicus'},\n",
" {'jan verheyen', 'verheyen'},\n",
" {'landskampioenschap van rusland', 'russische superliga'},\n",
" {'lijn 6', 'tramlijn 6'},\n",
" {'locomotief', 'locomotieven'},\n",
" {'logica', 'logische'},\n",
" {'oceanische', 'oceanië'},\n",
" ...]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the clusters ordered from the one with the most members to the one with the fewest.\n",
"list(reversed(sorted(clusters, key=len)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "75340f6b-359f-412c-a8c4-aa32e42d2a67",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}