{ "cells": [ { "cell_type": "markdown", "id": "9b16b625-0856-42fa-b61f-f14ae31eab55", "metadata": {}, "source": [ "# Surfaceform clusters" ] }, { "cell_type": "code", "execution_count": null, "id": "c9dacaf9-0413-468a-bd9f-d86e5d7123fd", "metadata": {}, "outputs": [], "source": [ "import json, math, tqdm\n", "\n", "fname = \"wiki/nlwiki-20220301/experiments/clean-q0.25.json\"\n", "anchor_scores = json.load(open(fname))\n", "# log transform\n", "anchor_scores = {\n", " a: {e: math.log1p(c) for e, c in ec.items()} for a, ec in anchor_scores.items()\n", "}\n", "# l2 normalize\n", "anchor_scores = {\n", " a: {\n", " e: c / t\n", " for t in [sum(v**2 for v in ec.values()) ** 0.5]\n", " for e, c in ec.items()\n", " }\n", " for a, ec in anchor_scores.items()\n", "}\n", "len(anchor_scores)" ] }, { "cell_type": "code", "execution_count": 14, "id": "a90af9e2-62e3-4217-a748-4861b70be7fa", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30472/30472 [00:00<00:00, 131802.04it/s]\n" ] }, { "data": { "text/plain": [ "(1900, 4902)" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "score_threshold = 0.5\n", "\n", "id_anchors = {}\n", "for a, es in anchor_scores.items():\n", " for e in es:\n", " id_anchors.setdefault(e, set()).add(a)\n", "\n", "surface_cluster = {i: i for i in anchor_scores}\n", "for a, es in tqdm.tqdm(anchor_scores.items()):\n", " others = set.union(*[id_anchors[e] for e in es]) - set([a])\n", " for o in others:\n", " x, y = set(anchor_scores[o]), set(es)\n", " # score = len(x&y) / len(x|y) # jacc\n", " score = sum(anchor_scores[o][v] * es[v] for v in x & y) # cosine\n", " if score > score_threshold:\n", " surface_cluster[o] = surface_cluster[a]\n", " # print(f'{a:20s} {o:20s}', score )\n", "\n", "clusters = {}\n", "for s, c in surface_cluster.items():\n", " clusters.setdefault(c, set()).add(s)\n", "clusters = [ss for ss in clusters.values() if len(ss) > 1]\n", "len(clusters), len(set(s for c in clusters for s in c))" ] }, { "cell_type": "code", "execution_count": 15, "id": "a58df8c2-cb67-41cc-b8fa-5c3a52c717e8", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "[{'cannondale pro cycling team',\n", " 'cannondale-drapac pro cycling team',\n", " 'ef education first pro cycling',\n", " 'ef education-easypost',\n", " 'ef education-nippo',\n", " 'garmin',\n", " 'garmin sharp',\n", " 'garmin-sharp',\n", " 'garmin-slipstream',\n", " 'garmin-transitions',\n", " 'team cannondale-garmin',\n", " 'team ef education first-drapac p/b cannondale',\n", " 'team garmin-cervélo',\n", " 'team garmin-sharp',\n", " 'team garmin-transitions'},\n", " {'amateurvoetbalclub',\n", " 'amateurvoetbalvereniging',\n", " 'bekertoernooi',\n", " 'clubs',\n", " 'profvoetbalclub',\n", " 'profvoetballer',\n", " 'voetbal',\n", " 'voetbalbeker',\n", " 'voetbalbond',\n", " 'voetbalclub',\n", " 'voetballer',\n", " 'voetballers',\n", " 'voetbalster',\n", " 'voetbalvereniging'},\n", " {'frankrijk',\n", " 'franse',\n", " 'franse republiek',\n", " 'franse revolutionaire',\n", " 'fransen',\n", " 'fransman',\n", " 'noord-frankrijk',\n", " 'noord-franse',\n", " 'revolutionaire',\n", " 'revolutionaire frankrijk',\n", " 'revolutionairen',\n", " 'zuid-frankrijk',\n", " 'zuid-franse'},\n", " {'amerika',\n", " 'amerikaanse',\n", " 'amerikaanse staat',\n", " 'amerikaanse staten',\n", " 'amerikanen',\n", " 'de verenigde staten',\n", " 'u.s.',\n", " 'united states',\n", " 'usa',\n", " 'verenigde staten',\n", " 'vs'},\n", " {'brit',\n", " 'brits',\n", " 'britse',\n", " 'britten',\n", " 'engeland',\n", " 'engelse',\n", " 'engelsen',\n", " 'groot-brittannië',\n", " 'uk',\n", " 'verenigd koninkrijk',\n", " 'vk'},\n", " {'bondsrepubliek',\n", " 'bondsrepubliek duitsland',\n", " 'duits',\n", " 'duits voetbalelftal',\n", " 'duitse',\n", " 'duitse afkomst',\n", " 'duitser',\n", " 'duitsers',\n", " 'nationale elftal',\n", " 'west-duitse',\n", " 'west-duitsland'},\n", " {'componeerde',\n", " 'componeren',\n", " 'componist',\n", " 'compositie',\n", " 'composities',\n", " 'gecomponeerd',\n", " 'liederen',\n", " 'nummer',\n", " 'nummers',\n", " 'track'},\n", " {'de oost',\n", " 'india',\n", " 'indiase',\n", " 'indisch',\n", " 'indische',\n", " 'indië',\n", " 'indiërs',\n", " 'indo',\n", " 'nederlands-indië',\n", " 'oost-indië'},\n", " {'katholicisme',\n", " 'katholiek',\n", " 'katholieke',\n", " 'katholieke geloof',\n", " 'katholieke kerk',\n", " 'katholieken',\n", " 'kerk',\n", " 'kerkelijke',\n", " 'rooms-katholiek',\n", " 'rooms-katholieke'},\n", " {'rome',\n", " 'romein',\n", " 'romeinen',\n", " 'romeins',\n", " 'romeinse',\n", " 'romeinse keizerrijk',\n", " 'romeinse keizertijd',\n", " 'romeinse oudheid',\n", " 'romeinse periode',\n", " 'romeinse tijd'},\n", " {'olympische spelen 2020',\n", " 'olympische spelen in tokio',\n", " 'olympische spelen van 1964',\n", " 'olympische spelen van tokio',\n", " 'olympische zomerspelen 1964',\n", " 'olympische zomerspelen 2020',\n", " 'os 2020',\n", " 'tokio 2020',\n", " 'zomerspelen van 1964'},\n", " {'deceuninck',\n", " 'deceuninck–quick-step',\n", " 'etixx-quick step',\n", " 'omega pharma-quick step',\n", " 'quick step',\n", " 'quick step-alpha vinyl',\n", " 'quick-step',\n", " 'quick-step floors',\n", " 'quickstep'},\n", " {'csc',\n", " 'saxo bank',\n", " 'saxo bank-sungard',\n", " 'team csc',\n", " 'team csc saxo bank',\n", " 'team saxo bank',\n", " 'team saxo-tinkoff',\n", " 'tinkoff',\n", " 'tinkoff-saxo'},\n", " {'koloniaal',\n", " 'koloniale',\n", " 'kolonialisme',\n", " 'kolonie',\n", " 'kolonies',\n", " 'kolonist',\n", " 'kolonisten',\n", " 'koloniën',\n", " 'pionier'},\n", " {'bezetting',\n", " 'duitse bezetter',\n", " 'duitse bezetters',\n", " 'duitse bezetting',\n", " 'nazi',\n", " 'naziregime',\n", " 'nazis',\n", " 'oorlogsjaren',\n", " 'tweede wereldoorlog'},\n", " {'aanval',\n", " 'aanvaller',\n", " 'av',\n", " 'centrumspits',\n", " 'linksbuiten',\n", " 'rechtsbuiten',\n", " 'schaduwspits',\n", " 'spits',\n", " 'vleugelaanvaller'},\n", " {'gr.',\n", " 'grieken',\n", " 'grieks',\n", " 'griekse',\n", " 'griekse oudheid',\n", " 'griekse taal',\n", " 'oud-griekse',\n", " 'oude grieken',\n", " 'oudgrieks'},\n", " {'italiaan',\n", " 'italiaans',\n", " 'italiaanse',\n", " 'italiaanse republiek',\n", " 'italianen',\n", " 'noord-italiaanse',\n", " 'noord-italië',\n", " 'zuid-italiaanse',\n", " 'zuid-italië'},\n", " {'lampre',\n", " 'lampre-caffita',\n", " 'lampre-farnese vini',\n", " 'lampre-fondital',\n", " 'lampre-isd',\n", " 'lampre-merida',\n", " 'lampre-ngc',\n", " 'uae team emirates'},\n", " {'cd',\n", " 'cd-single',\n", " 'cds',\n", " 'debuut',\n", " 'debuutsingle',\n", " 'single',\n", " 'singles',\n", " 'vinylsingle'},\n", " {'argos-shimano',\n", " 'bankgiroloterij',\n", " 'giant-shimano',\n", " 'skil-shimano',\n", " 'team dsm',\n", " 'team giant-alpecin',\n", " 'team giant-shimano',\n", " 'team sunweb'},\n", " {'federaal',\n", " 'federale',\n", " 'federale overheid',\n", " 'federale republiek',\n", " 'federalist',\n", " 'federalisten',\n", " 'federalistische',\n", " 'federatie'},\n", " {'ultratop',\n", " 'ultratop 200',\n", " 'ultratop 50',\n", " 'vlaams',\n", " 'vlaamse',\n", " 'vlaanderen',\n", " 'vlaming',\n", " 'vlamingen'},\n", " {'beroepswielrenner',\n", " 'weg',\n", " 'wielerploeg',\n", " 'wielersport',\n", " 'wielerwedstrijd',\n", " 'wielrennen',\n", " 'wielrenner',\n", " 'wielrenster'},\n", " {'bonjour',\n", " 'bouygues télécom',\n", " 'direct énergie',\n", " 'europcar',\n", " 'team europcar',\n", " 'team totalenergies',\n", " 'total direct energie'},\n", " {'fdj',\n", " 'fdj-bigmat',\n", " 'fdj.fr',\n", " 'fdjeux.com',\n", " 'française des jeux',\n", " 'groupama-fdj',\n", " 'la française des jeux'},\n", " {'academici',\n", " 'academicus',\n", " 'academisch',\n", " 'academische',\n", " 'universiteit',\n", " 'universiteiten',\n", " 'wo'},\n", " {'golfbaan',\n", " 'golfclub',\n", " 'golfclubs',\n", " 'golfer',\n", " 'golfprofessional',\n", " 'golfterrein',\n", " 'professional'},\n", " {'londen 2012',\n", " 'olympische spelen 2012',\n", " 'olympische spelen in londen',\n", " 'olympische spelen van 2012',\n", " 'olympische spelen van londen',\n", " 'olympische zomerspelen 2012',\n", " 'olympische zomerspelen van 2012'},\n", " {'koninkrijk',\n", " 'koninkrijken',\n", " 'monarchie',\n", " 'monarchist',\n", " 'monarchisten',\n", " 'monarchistische',\n", " 'royalisten'},\n", " {'congo',\n", " 'congo-brazzaville',\n", " 'congo-kinshasa',\n", " 'congolees',\n", " 'congolese',\n", " 'kongo',\n", " 'republiek congo'},\n", " {'linkervleugelverdediger',\n", " 'linksachter',\n", " 'linksback',\n", " 'rechtervleugelverdediger',\n", " 'rechtsachter',\n", " 'rechtsback',\n", " 'vleugelverdediger'},\n", " {'kamer',\n", " 'kamerlid',\n", " 'kamerverkiezingen',\n", " 'tweede kamerfractie',\n", " 'tweede kamerlid',\n", " 'verkiezingen van 2014',\n", " 'verkiezingen van 25 mei 2014'},\n", " {'dagvlinder',\n", " 'mot',\n", " 'motten',\n", " 'nachtvlinder',\n", " 'nachtvlinders',\n", " 'vlinder',\n", " 'vlinders'},\n", " {'beuk',\n", " 'beuken',\n", " 'driebeukige',\n", " 'kerkschip',\n", " 'schip',\n", " 'zijbeuken',\n", " 'zijschip'},\n", " {'beschermd monument',\n", " 'monument',\n", " 'monumentaal',\n", " 'monumentale',\n", " 'monumenten',\n", " 'monumentenlijst',\n", " 'onroerend erfgoed'},\n", " {'film',\n", " 'film-',\n", " 'filmmaker',\n", " 'geregisseerd',\n", " 'regie',\n", " 'regisseur',\n", " 'regisseuse'},\n", " {'nederlander',\n", " 'nederlanders',\n", " 'nederlands',\n", " 'nederlandse',\n", " 'nederlandstalig',\n", " 'nederlandstalige',\n", " 'nl'},\n", " {'davitamon-lotto',\n", " 'lotto soudal',\n", " 'lotto-belisol',\n", " 'omega pharma-lotto',\n", " 'predictor-lotto',\n", " 'silence-lotto'},\n", " {'olympische spelen 2016',\n", " 'olympische spelen van 2016',\n", " 'olympische zomerspelen 2016',\n", " 'olympische zomerspelen van 2016',\n", " 'rio 2016',\n", " 'rio de janeiro 2016'},\n", " {'langebaan',\n", " 'langebaanschaatser',\n", " 'schaats',\n", " 'schaatsen',\n", " 'schaatser',\n", " 'schaatsster'},\n", " {'gif', 'giftig', 'giftige', 'giftigheid', 'toxisch', 'toxische'},\n", " {'warner',\n", " 'warner bros',\n", " 'warner bros.',\n", " 'warner bros. pictures',\n", " 'warner bros. records',\n", " 'warner brothers'},\n", " {'elpee', 'lp', 'lps', 'plaat', 'platen', 'vinyl'},\n", " {'gereformeerd',\n", " 'gereformeerde',\n", " 'gereformeerde kerk',\n", " 'gereformeerde kerken',\n", " 'gereformeerde kerken in nederland',\n", " 'gereformeerden'},\n", " {'neoclassicisme',\n", " 'neoclassicistisch',\n", " 'neoclassicistische',\n", " 'neoclassicistische stijl',\n", " 'neoklassieke',\n", " 'neoklassieke stijl'},\n", " {'krijgsmacht', 'landmacht', 'leger', 'legers', 'militair', 'militaire'},\n", " {'organist', 'orgel', 'orgelpijpen', 'orgels', 'pijpen', 'pijpwerk'},\n", " {'verzet',\n", " 'verzetsgroep',\n", " 'verzetsman',\n", " 'verzetsstrijder',\n", " 'verzetsstrijders',\n", " 'verzetsstrijdster'},\n", " {'economen',\n", " 'economie',\n", " 'economisch',\n", " 'economische',\n", " 'economische wetenschappen',\n", " 'econoom'},\n", " {'palestijnen',\n", " 'palestijns',\n", " 'palestijnse',\n", " 'palestijnse gebieden',\n", " 'palestijnse staat',\n", " 'palestina'},\n", " {'joden', 'jodium', 'jood', 'joods', 'joodse', 'joodse gemeenschap'},\n", " {'ier', 'ieren', 'ierland', 'iers', 'ierse', 'ierse republiek'},\n", " {'deen', 'deens', 'deense', 'denemarken', 'denen', 'koninkrijk denemarken'},\n", " {'rus', 'rusland', 'russen', 'russisch', 'russische', 'russische rijk'},\n", " {'koninkrijk zweden', 'swe', 'zweden', 'zweed', 'zweeds', 'zweedse'},\n", " {'bardiani csf',\n", " 'bardiani valvole-csf inox',\n", " 'bardiani-csf-faizanè',\n", " 'colnago-csf inox',\n", " 'csf group-navigare'},\n", " {'communist', 'communiste', 'communisten', 'communistisch', 'pcb'},\n", " {'aves', 'avifauna', 'gevogelte', 'vogel', 'vogels'},\n", " {'farnese vini-neri sottoli',\n", " 'neri sottoli',\n", " 'southeast',\n", " 'vini fantini-selle italia',\n", " 'wilier triestina-southeast'},\n", " {'leopard trek',\n", " 'radioshack leopard',\n", " 'radioshack-leopard',\n", " 'radioshack-nissan-trek',\n", " 'trek factory racing'},\n", " {'htc-highroad',\n", " 't-mobile team',\n", " 'team columbia',\n", " 'team htc-columbia',\n", " 'team stuttgart'},\n", " {'wereldkampioenschappen sprint',\n", " 'wk
sprint',\n", " 'wk
sprint',\n", " 'wk sprint',\n", " 'wk
sprint'},\n", " {'buis', 'buizen', 'leidingen', 'pijpleiding', 'pijpleidingen'},\n", " {'hallucinogene',\n", " 'psychedelica',\n", " 'psychedelisch',\n", " 'psychedelische',\n", " 'psychedelische rock'},\n", " {'ineos grenadiers', 'sky', 'sky procycling', 'team ineos', 'team sky'},\n", " {'astana pro team',\n", " 'astana qazaqstan',\n", " 'astana-premier tech',\n", " 'liberty seguros-würth',\n", " 'pro team astana'},\n", " {'erfelijk', 'erfelijke', 'erfelijkheid', 'geneticus', 'genetisch'},\n", " {'olympisch',\n", " 'olympisch kampioen',\n", " 'olympisch kampioene',\n", " 'olympische',\n", " 'spelen'},\n", " {'olympische spelen 2008',\n", " 'olympische spelen in peking',\n", " 'olympische spelen van peking',\n", " 'olympische zomerspelen 2008',\n", " 'peking 2008'},\n", " {'ghz', 'hertz', 'hz', 'khz', 'mhz'},\n", " {'magneet', 'magneten', 'magnetisch', 'magnetisch veld', 'magnetische'},\n", " {'gasthoogleraar', 'gewoon hoogleraar', 'hoogleraar', 'prof.', 'professor'},\n", " {'evangelisch-luthers',\n", " 'evangelisch-lutherse',\n", " 'evangelisch-lutherse kerk',\n", " 'luthers',\n", " 'lutherse'},\n", " {'mitchelton-scott',\n", " 'orica greenedge',\n", " 'orica-bikeexchange',\n", " 'orica-scott',\n", " 'team bikeexchange'},\n", " {'fl', 'fl.', 'florida', 'gulden', 'ƒ'},\n", " {'middelbaar onderwijs',\n", " 'middelbare',\n", " 'middelbare scholen',\n", " 'middelbare school',\n", " 'secundair'},\n", " {'habsburg', 'habsburgers', 'habsburgs', 'habsburgse', 'habsburgse rijk'},\n", " {'astrologen', 'astrologie', 'astrologisch', 'astrologische', 'astroloog'},\n", " {'belasting', 'belastingen', 'fiscaal', 'fiscale', 'schatting'},\n", " {'gelatiniseerd', 'gelatiniseerde', 'latijn', 'latijns', 'latijnse'},\n", " {'wereldkampioenschappen voor junioren',\n", " 'wjk',\n", " 'wk junioren',\n", " 'wk u20',\n", " 'wk voor junioren'},\n", " {'wereldkampioenschappen afstanden',\n", " 'wk
afstanden',\n", " 'wk
afstanden',\n", " 'wk afstanden',\n", " 'wk
afstanden'},\n", " {'elisabeth',\n", " 'elisabeth in beieren',\n", " 'elisabeth van belgië',\n", " 'koningin elisabeth',\n", " 'prinses elisabeth'},\n", " {'ferry', 'pont', 'veer', 'veerdienst', 'veerpont'},\n", " {'babylon', 'babylonisch', 'babylonische', 'babylonië', 'babyloniërs'},\n", " {'nederlandse publieke omroep',\n", " 'npo',\n", " 'openbare omroep',\n", " 'publieke',\n", " 'publieke omroep'},\n", " {'gelder', 'gelderland', 'gelders', 'gelderse', 'gelre'},\n", " {'organisch',\n", " 'organische',\n", " 'organische stoffen',\n", " 'organische verbinding',\n", " 'organische verbindingen'},\n", " {'electronic', 'electronica', 'elektronica', 'elektronisch', 'elektronische'},\n", " {'dokter', 'geneeskunde', 'geneeskundige', 'medicijnen', 'medische'},\n", " {'basisonderwijs',\n", " 'basisscholen',\n", " 'basisschool',\n", " 'lager onderwijs',\n", " 'lagere school'},\n", " {'schaak', 'schaakpartij', 'schaakster', 'schaakvereniging', 'schaken'},\n", " {'gotiek', 'gotisch', 'gotische', 'laatgotisch', 'laatgotische'},\n", " {'rugby', 'rugbyclub', 'rugbyers', 'rugbyspeler', 'rugbyteam'},\n", " {'genova', 'genua', 'genuees', 'genuese', 'genuezen'},\n", " {'be', 'belg', 'belgen', 'belgisch', 'belgische'},\n", " {'humanisme', 'humanist', 'humanisten', 'humanistisch', 'humanistische'},\n", " {'siam', 'thai', 'thailand', 'thais', 'thaise'},\n", " {'beschrijving', 'wapen', 'wapens', 'wapenschild', 'wapenschilden'},\n", " {'perzen', 'perzisch', 'perzische', 'perzische rijk', 'perzië'},\n", " {'klinker', 'klinkers', 'vocalen', 'zang', 'zangers'},\n", " {'filosofe', 'filosofen', 'filosofie', 'filosofisch', 'filosoof'},\n", " {'let', 'letland', 'lets', 'letse', 'republiek letland'},\n", " {'litouwen', 'litouwers', 'litouws', 'litouwse', 'ltu'},\n", " {'bas', 'basgitaar', 'basgitarist', 'bass', 'bassist'},\n", " {'bosnisch',\n", " 'bosnische',\n", " 'bosnië',\n", " 'bosnië en herzegovina',\n", " 'bosnië-herzegovina'},\n", " {'mar', 'marokkaan', 'marokkaans', 'marokkaanse', 'marokko'},\n", " {'barok', 'barokke', 'barokperiode', 'barokschilder', 'barokstijl'},\n", " {'serven', 'servisch', 'servische', 'servië', 'serviër'},\n", " {'mathematica', 'wis', 'wis-', 'wiskunde', 'wiskundige'},\n", " {'doel', 'doelman', 'doelvrouw', 'gk', 'keeper'},\n", " {'show', 'theater', 'theatermaker', 'theaterstukken', 'toneel'},\n", " {'noor', 'noors', 'noorse', 'nor', 'noren'},\n", " {'hongaar', 'hongaars', 'hongaarse', 'hongaren', 'hongarije'},\n", " {'fin', 'finland', 'finnen', 'fins', 'finse'},\n", " {'auteur', 'publicist', 'schrijfster', 'schrijver', 'schrijvers'},\n", " {'limburg', 'limburgers', 'limburgs', 'limburgse', 'provincie limburg'},\n", " {'japan', 'japanner', 'japanners', 'japans', 'japanse'},\n", " {'spaans', 'spaanse', 'spanjaard', 'spanjaarden', 'spanje'},\n", " {'ingepolderd', 'inpoldering', 'polder', 'polders'},\n", " {'footon-servetto', 'fuji-servetto', 'geox-tmc', 'scott-american beef'},\n", " {'androni giocattoli-sidermec',\n", " 'androni giocattoli-venezuela',\n", " 'drone hopper-androni giocattoli',\n", " 'selle italia'},\n", " {'huis', 'woning', 'woningen', 'woonhuis'},\n", " {'mythen', 'mythische', 'mythologie', 'mythologische'},\n", " {'roompot oranje peloton',\n", " 'roompot-charles',\n", " 'roompot-nederlandse loterij',\n", " 'roompot-oranje peloton'},\n", " {'autocoureur', 'coureur', 'coureurs', 'motorcoureur'},\n", " {'geïndustrialiseerde', 'industrialisatie', 'industrie', 'industriële'},\n", " {'eedgenootschap',\n", " 'eedgenoten',\n", " 'zwitsers eedgenootschap',\n", " 'zwitserse eedgenootschap'},\n", " {'accent.jobs-willems verandas',\n", " 'circus-wanty-gobert',\n", " 'intermarché-wanty-gobert matériaux',\n", " 'wanty-groupe gobert'},\n", " {'doctor', 'doctoraat', 'doctorstitel', 'ph.d.'},\n", " {'gymnast', 'gymnaste', 'gymnastiek', 'turner'},\n", " {'plant', 'planten', 'plantensoort', 'species'},\n", " {'padvinderij', 'scout', 'scouting', 'scouts'},\n", " {'alpineskiester', 'alpineskiën', 'alpineskiër', 'skiër'},\n", " {'institutie', 'instituties', 'instituut', 'organisatie'},\n", " {'afgezet', 'afzetting', 'afzettingen', 'sedimenten'},\n", " {'retorica', 'retoriek', 'retorisch', 'retorische'},\n", " {'bayern', 'bayern münchen', 'bayern münchen ii', 'fc bayern münchen'},\n", " {'bisdom', 'diocees', 'diocesaan', 'diocesane'},\n", " {'bladsteel', 'bladstelen', 'bloemstengel', 'knol'},\n", " {'nationale team', 'nederlands handbalteam', 'nederlands team', 'senioren'},\n", " {'mtn-qhubeka',\n", " 'ntt pro cycling',\n", " 'team dimension data',\n", " 'team qhubeka-assos'},\n", " {'gepubliceerd', 'publicatie', 'publiceerde', 'verhandeling'},\n", " {'ruiter', 'ruiters', 'springen', 'springruiter'},\n", " {'westen', 'westers', 'westerse', 'westerse beschaving'},\n", " {'academie van beeldende kunsten',\n", " 'academie voor beeldende kunsten',\n", " 'koninklijke academie',\n", " 'koninklijke academie van beeldende kunsten'},\n", " {'wereldkampioenschappen allround',\n", " 'wk
allround',\n", " 'wk
allround',\n", " 'wk
allround'},\n", " {'aartsbisdom utrecht',\n", " 'aartsbisschop van utrecht',\n", " 'bisdom utrecht',\n", " 'bisschop van utrecht'},\n", " {'vesting', 'vestingwal', 'wal', 'wallen'},\n", " {'olympische spelen 2000',\n", " 'olympische spelen in sydney',\n", " 'olympische spelen van sydney',\n", " 'olympische zomerspelen 2000'},\n", " {'maximiliaan',\n", " 'maximiliaan i',\n", " 'maximiliaan van habsburg',\n", " 'maximiliaan van oostenrijk'},\n", " {'athene 2004',\n", " 'olympische spelen in athene',\n", " 'olympische spelen van athene',\n", " 'olympische zomerspelen 2004'},\n", " {'manufacturen', 'textiel', 'textielfabriek', 'weefsel'},\n", " {'katjoesja',\n", " 'team katjoesja',\n", " 'team katjoesja alpecin',\n", " 'tinkoff credit systems'},\n", " {'gbr fea', 'gbr spr', 'sil', 'silverstone'},\n", " {'belkin', 'jumbo-visma', 'team jumbo-visma', 'team lottonl-jumbo'},\n", " {'cannondale', 'liquigas', 'liquigas-cannondale', 'liquigas-doimo'},\n", " {'impressionisme',\n", " 'impressionisten',\n", " 'impressionistisch',\n", " 'impressionistische'},\n", " {'bourgondische',\n", " 'bourgondiërs',\n", " 'hertog van bourgondië',\n", " 'hertogen van bourgondië'},\n", " {'gemeenteraad', 'gemeenteraadslid', 'raad', 'raadslid'},\n", " {'staats', 'staatse', 'staatse leger', 'staatse troepen'},\n", " {'carnivore', 'carnivoren', 'roofdieren', 'vleesetende'},\n", " {'patriot', 'patriotten', 'patriottisch', 'patriottische'},\n", " {'individuele tijdrit', 'itt', 'tijdrijden', 'tijdrit'},\n", " {'klassiek', 'klassieke', 'klassieke muziek', 'muziekgeschiedenis'},\n", " {'gesynthetiseerd', 'synthese', 'synthetisch', 'synthetische'},\n", " {'zuid-', 'zuid-amerika', 'zuid-amerikaans', 'zuid-amerikaanse'},\n", " {'ratio', 'rationeel', 'rationele', 'rede'},\n", " {'gereduceerd', 'reducerende', 'reducerende stoffen', 'reductie'},\n", " {'orangisme', 'orangist', 'orangisten', 'orangistische'},\n", " {'caisse depargne',\n", " 'caisse depargne-illes balears',\n", " 'movistar',\n", " 'movistar team'},\n", " {'diplomaat', 'diplomaten', 'diplomatie', 'gezant'},\n", " {'biografie', 'biografieën', 'biografisch', 'biografische'},\n", " {'inheems', 'inheemse', 'inheemse bevolking', 'inheemsen'},\n", " {'heiden', 'heidenen', 'heidens', 'heidense'},\n", " {'republikein', 'republikeinen', 'republikeins', 'republikeinse'},\n", " {'socialisme', 'socialisten', 'socialistisch', 'socialistische'},\n", " {'assur', 'assyrische', 'assyriërs', 'aššur'},\n", " {'allegorie', 'allegorieën', 'allegorisch', 'allegorische'},\n", " {'aanslag', 'aanslagen', 'terreur', 'terroristische aanslagen'},\n", " {'afghaans', 'afghanen', 'afghanistan', 'islamitisch emiraat afghanistan'},\n", " {'album', 'albums', 'soloalbum', 'studioalbum'},\n", " {'laatromaanse', 'romaans', 'romaanse', 'romaanse stijl'},\n", " {'hulpkerk', 'kapel', 'kapelletje', 'veldkapel'},\n", " {'heilig', 'heilig verklaard', 'heilige', 'heiligverklaring'},\n", " {'archeologe', 'archeologen', 'archeologisch onderzoek', 'archeoloog'},\n", " {'afbeeldingen', 'grafiek', 'prent', 'prenten'},\n", " {'app', 'applicatie', 'applicaties', 'apps'},\n", " {'space shuttle', 'spaceshuttle', 'spaceshuttlemissie', 'spaceshuttles'},\n", " {'europa cup', 'europa cup i', 'europacup', 'europacup i'},\n", " {'byzantijns', 'byzantijnse', 'byzantium', 'constantinopel'},\n", " {'marine', 'marineofficier', 'nederlandse marine', 'officier'},\n", " {'beeld', 'beelden', 'sculpturen', 'sculptuur'},\n", " {'islam', 'islamitische', 'moslim', 'moslims'},\n", " {'amazigh', 'berber', 'berbers', 'berberse'},\n", " {'cetacea', 'walvis', 'walvisachtigen', 'walvissen'},\n", " {'arabier', 'arabieren', 'arabisch', 'arabische'},\n", " {'europa', 'europees', 'europese', 'europese vasteland'},\n", " {'kunstschilder', 'kunstschilders', 'schilder', 'schilderen'},\n", " {'fruit', 'schil', 'vrucht', 'vruchten'},\n", " {'martelaar', 'martelaren', 'martelares', 'marteldood'},\n", " {'disney', 'disneyfilm', 'disneys', 'walt disney'},\n", " {'eclecticisme', 'eclectisch', 'eclectische', 'eclectische stijl'},\n", " {'genus', 'geslacht', 'geslachten', 'geslachtsnaam'},\n", " {'normandisch', 'normandische', 'normandië', 'normandiërs'},\n", " {'slaaf', 'slaven', 'slavenhandel', 'slavernij'},\n", " {'coach', 'trainer', 'voetbalcoach', 'voetbaltrainer'},\n", " {'activisme', 'activist', 'activisten', 'activistische'},\n", " {'coronacrisis', 'coronapandemie', 'coronavirus', 'covid-19'},\n", " {'evolutie', 'evolutionair', 'evolutionaire', 'geëvolueerd'},\n", " {'dictator', 'dictatoriaal', 'dictatoriale', 'dictatuur'},\n", " {'beieren', 'beiers', 'beierse', 'hertogdom beieren'},\n", " {'christelijk', 'christelijke', 'christen', 'christenen'},\n", " {'dieren', 'dierenrijk', 'diersoorten', 'fauna'},\n", " {'democraat', 'democraten', 'democratisch', 'democratische'},\n", " {'tropen', 'tropisch', 'tropische', 'tropische gebieden'},\n", " {'catalaans', 'catalaanse', 'catalanen', 'catalonië'},\n", " {'aartsbisschop van keulen', 'keulen', 'keulse', 'köln'},\n", " {'christus', 'jezus', 'jezus christus', 'jezus van nazareth'},\n", " {'basken', 'baskenland', 'baskisch', 'baskische'},\n", " {'fysica', 'fysische', 'natuur-', 'natuurkunde'},\n", " {'tibet', 'tibetaans', 'tibetaanse', 'tibetanen'},\n", " {'slowaaks', 'slowaakse', 'slowaakse republiek', 'slowakije'},\n", " {'koninkrijk pruisen', 'pruisen', 'pruisisch', 'pruisische'},\n", " {'venetiaans', 'venetiaanse', 'venetianen', 'venetië'},\n", " {'macedonisch', 'macedonische', 'macedonië', 'noord-macedonië'},\n", " {'baan', 'baanwielrennen', 'baanwielrenner', 'scratch'},\n", " {'rode leger', 'sovjet', 'sovjets', 'sovjettroepen'},\n", " {'comedy', 'komedie', 'komisch', 'komische'},\n", " {'sloveens', 'sloveense', 'slovenen', 'slovenië'},\n", " {'oekraïens', 'oekraïense', 'oekraïne', 'ukr'},\n", " {'game', 'gamer', 'gaming', 'spel'},\n", " {'schotland', 'schots', 'schotse', 'schotten'},\n", " {'bretagne', 'bretoense', 'bretons', 'bretonse'},\n", " {'metro', 'metrolijn', 'metrolijnen', 'u-bahn'},\n", " {'oostenrijk', 'oostenrijkers', 'oostenrijks', 'oostenrijkse'},\n", " {'kroaat', 'kroaten', 'kroatisch', 'kroatische'},\n", " {'egypte', 'egyptenaren', 'egyptisch', 'egyptische'},\n", " {'gitaar', 'gitarist', 'leadgitaar', 'slaggitaar'},\n", " {'eerste klasse', 'eerste klasse a', 'eersteklasser', 'jupiler league'},\n", " {'jazz', 'jazzpianist', 'pianist', 'pianiste'},\n", " {'tsjechen', 'tsjechisch', 'tsjechische', 'tsjechië'},\n", " {'roemeens', 'roemeense', 'roemenen', 'roemenië'},\n", " {'ch', 'sui', 'zwitserland', 'zwitsers'},\n", " {'portugal', 'portugees', 'portugese', 'portugezen'},\n", " {'bra', 'braziliaans', 'braziliaanse', 'brazilië'},\n", " {'groothertogdom luxemburg', 'luxemburg', 'luxemburgs', 'luxemburgse'},\n", " {'polen', 'pool', 'pools', 'poolse'},\n", " {'aus', 'australisch', 'australische', 'australië'},\n", " {'china', 'chinees', 'chinese', 'chinezen'},\n", " {'fr', 'frans', 'franstalig', 'franstalige'},\n", " {'gevaccineerd', 'vaccin', 'vaccinatie'},\n", " {'gp van zwitserland', 'grand prix van zwitserland', 'zwi'},\n", " {'betovering', 'magische', 'toveren'},\n", " {'wereldkampioenschap voetbal onder 20', 'wk onder 20', 'wk onder-20'},\n", " {'kegel', 'kegels', 'taps'},\n", " {'burgerlijk', 'civiel recht', 'civiele procedure'},\n", " {'duitse keizer', 'keizerlijke', 'rooms-duits'},\n", " {'hyacint', 'hyacinten', 'hyacinthus'},\n", " {'creolen', 'creool', 'creoolse'},\n", " {'nations league', 'uefa nations league', 'uefa nations league 2020/21'},\n", " {'bok', 'geit', 'geiten'},\n", " {'comedyserie', 'komedieserie', 'sitcom'},\n", " {'container', 'containers', 'containerterminal'},\n", " {'jaren 80', 'jaren tachtig', 'tachtig'},\n", " {'jaren 70', 'jaren zeventig', 'zeventig'},\n", " {'bisschop', 'bisschopswijding', 'gewijd'},\n", " {'profetie', 'voorspelling', 'voorspellingen'},\n", " {'schieten', 'schietsport', 'schutter'},\n", " {'gesteriliseerd', 'sterilisatie', 'steriliseren'},\n", " {'procyon', 'wasbeer', 'wasberen'},\n", " {'bora-argon 18', 'bora-hansgrohe', 'team netapp-endura'},\n", " {'bourgeois', 'burger', 'burgers'},\n", " {'minimalisme', 'minimalistisch', 'minimalistische'},\n", " {'neutraal', 'neutrale', 'neutraliteit'},\n", " {'topsport vlaanderen',\n", " 'topsport vlaanderen-baloise',\n", " 'topsport vlaanderen-mercator'},\n", " {'bisdom luik', 'bisschop van luik', 'luiks'},\n", " {'cofidis', 'cofidis, le crédit en ligne', 'cofidis, solutions crédits'},\n", " {'discipline', 'disciplines', 'vakgebied'},\n", " {'fiets', 'fietsen', 'fietsers'},\n", " {'geschiedkundige', 'historica', 'historicus'},\n", " {'rijtuig', 'rijtuigen', 'wagons'},\n", " {'novo nordisk', 'team novo nordisk', 'team type 1-sanofi'},\n", " {'cinema', 'cinematograaf', 'videos'},\n", " {'ag2r la mondiale', 'ag2r-citroën', 'ag2r-la mondiale'},\n", " {'jaren 90', 'jaren negentig', 'negentig'},\n", " {'gestucte', 'pleister', 'stucwerk'},\n", " {'ek
allround', 'ek
allround', 'europese titel'},\n", " {'geschiedenis', 'historisch', 'historische'},\n", " {'gestalkt', 'stalken', 'stalker'},\n", " {'zout water', 'zoutwater', 'zoutwatervis'},\n", " {'bos', 'bossen', 'woud'},\n", " {'dnipro', 'dnipropetrovsk', 'dnjepropetrovsk'},\n", " {'historiserende', 'historiserende stijl', 'historisme'},\n", " {'tweezaadlobbig', 'tweezaadlobbige', 'tweezaadlobbige planten'},\n", " {'visigoten', 'visigotisch', 'visigotische'},\n", " {'agrarische', 'agrarische sector', 'landbouw'},\n", " {'continu', 'continue', 'continuïteit'},\n", " {'discus', 'discuswerpen', 'discuswerper'},\n", " {'bastaard', 'bastaarden', 'hybridisatie'},\n", " {'sanoma', 'sanoma media', 'sanoma uitgevers'},\n", " {'amsterdam sloterdijk', 'sloterdijk', 'station sloterdijk'},\n", " {'districtsraad', 'districtsraden', 'districtsvoorzitter'},\n", " {'mediterraan', 'mediterrane', 'middellandse zeegebied'},\n", " {'polytechnische school', 'technische hogeschool', 'technische universiteit'},\n", " {'transcendent', 'transcendente', 'transcendentie'},\n", " {'arkéa-samsic', 'bretagne-séché environnement', 'fortuneo-vital concept'},\n", " {'huis van oranje', 'oranje-nassau', 'oranjes'},\n", " {'postmoderne', 'postmodernisme', 'postmodernistische'},\n", " {'ampère', 'stroom', 'stroomsterkte'},\n", " {'antwerp', 'antwerp fc', 'royal antwerp'},\n", " {'golfoorlog', 'oorlog in irak', 'tweede golfoorlog'},\n", " {'muziekpedagoog', 'pedagoge', 'pedagoog'},\n", " {'ramen', 'venster', 'vensters'},\n", " {'moraliteit', 'moreel', 'morele'},\n", " {'resistent', 'resistente', 'resistentie'},\n", " {'geologisch tijdvak', 'geologische geschiedenis', 'tijdperk'},\n", " {'olympische spelen 1984',\n", " 'olympische spelen van los angeles',\n", " 'olympische zomerspelen 1984'},\n", " {'archeologische vindplaats', 'site', 'sites'},\n", " {'kg', 'kilo', 'µg'},\n", " {'spoorlijn', 'spoorweg', 'spoorwegen'},\n", " {'viking', 'vikings', 'vikingtijd'},\n", " {'aziatisch', 'aziatische', 'azië'},\n", " {'bahrain mclaren', 'bahrain-merida', 'bahrain-victorious'},\n", " {'twente', 'twents', 'twentse'},\n", " {'zuid-nederland', 'zuid-nederlands', 'zuidelijke nederlanden'},\n", " {'mannelijk', 'mannelijke', '♂'},\n", " {'modernisme', 'modernistische', 'moderniteit'},\n", " {'sociaal', 'sociale', 'sociale wetenschappen'},\n", " {'criticus', 'kritisch', 'muziekcriticus'},\n", " {'expressionisme', 'expressionistische', 'expressionistische stijl'},\n", " {'gaskamer', 'vergassing', 'vergast'},\n", " {'lombarden', 'longobarden', 'longobardische'},\n", " {'abstract', 'abstracte', 'abstractie'},\n", " {'cult', 'cultstatus', 'scene'},\n", " {'minister van volksgezondheid',\n", " 'ministerie van volksgezondheid',\n", " 'volksgezondheid'},\n", " {'arenberg', 'hertog van arenberg', 'hertogdom arenberg'},\n", " {'bekerwinnaar sovjet-unie', 'bekerwinnaar van de sovjet-unie', 'ussr cup'},\n", " {'belgisch voetbalelftal', 'belgische nationale ploeg', 'nationale ploeg'},\n", " {'doodstraf', 'executie', 'geëxecuteerd'},\n", " {'holocaust', 'jodenvervolging', 'shoah'},\n", " {'motorrijtuig', 'motorwagen', 'motorwagens'},\n", " {'whig', 'whig party', 'whigs'},\n", " {'ca', 'california', 'californië'},\n", " {'icon', 'iconen', 'icoon'},\n", " {'feldwebel', 'sergeant', 'unteroffizier'},\n", " {'scenarios', 'script', 'scripts'},\n", " {'act', 'akte', 'akten'},\n", " {'editor', 'filmeditor', 'filmmonteur'},\n", " {'iris', 'irissen', 'lis'},\n", " {'sirene', 'sirenen', 'sirenes'},\n", " {'longslak', 'slak', 'slakken'},\n", " {'dualisme', 'dualistisch', 'dualistische'},\n", " {'breedte', 'nb', 'noorderbreedte'},\n", " {'academy award', 'academy awards', 'oscar'},\n", " {'blind', 'blinde', 'blinden'},\n", " {'cover', 'coverband', 'covers'},\n", " {'dode', 'dood', 'stoffelijk overschot'},\n", " {'10.000 m', '10.000 meter', '10000 m'},\n", " {'dame', 'dames', 'vrouw'},\n", " {'labour', 'labour party', 'labour-partij'},\n", " {'miniaturen', 'miniaturist', 'miniatuur'},\n", " {'isaan', 'noordoost', 'noordoosten'},\n", " {'serie', 'series', 'televisieserie'},\n", " {'billboard', 'billboards', 'de amerikaanse hitlijst'},\n", " {'beperking', 'handicap', 'mindervaliden'},\n", " {'saami', 'samen', 'sami'},\n", " {'akoestiek', 'akoestisch', 'akoestische'},\n", " {'science fiction', 'sciencefiction', 'sf'},\n", " {'strategie', 'strategisch', 'strategische'},\n", " {'piano', 'pianoforte', 'pianoles'},\n", " {'de kempen', 'kempen', 'kempens'},\n", " {'leningrad', 'sint-petersburg', 'st. petersburg'},\n", " {'somalisch', 'somalische', 'somalië'},\n", " {'kat', 'kater', 'katten'},\n", " {'indo-europeanen', 'indo-europees', 'indo-europese'},\n", " {'gothenburg', 'göteborg', 'göteborg c'},\n", " {'nederzetting', 'plaats', 'woonplaats'},\n", " {'beker van rusland', 'bekerwinnaar rusland', 'bekerwinnaar van rusland'},\n", " {'drone', 'drones', 'uav'},\n", " {'jong vitesse', 'sbv vitesse', 'vitesse'},\n", " {'opstand', 'rebel', 'rebellen'},\n", " {'caraïbische', 'cariben', 'caribische'},\n", " {'molukken', 'molukkers', 'molukse'},\n", " {'tantra', 'tantras', 'tantrische'},\n", " {'grootloge', 'obediëntie', 'vrijmetselaarsloge'},\n", " {'conservatief', 'conservatieve', 'conservatieven'},\n", " {'leen', 'leengoed', 'lenen'},\n", " {'fellow', 'leden', 'lid'},\n", " {'im', 'internationaal meester', 'meester'},\n", " {'jaren 10', 'jaren tien', 'tien'},\n", " {'bus', 'buslijn', 'busvervoer'},\n", " {'economische zaken',\n", " 'minister van economische zaken',\n", " 'ministerie van economische zaken'},\n", " {'binnenlandse zaken',\n", " 'minister van binnenlandse zaken',\n", " 'ministerie van binnenlandse zaken'},\n", " {'grote', 'pacific', 'pacifische'},\n", " {'boerderij', 'hoeve', 'hofstede'},\n", " {'europees hof van justitie',\n", " 'hof van justitie',\n", " 'hof van justitie van de europese gemeenschappen'},\n", " {'academie',\n", " 'academie voor schone kunsten',\n", " 'koninklijke academie voor schone kunsten'},\n", " {'promotie', 'promoveerde', 'promoveren'},\n", " {'regent', 'regenten', 'regentes'},\n", " {'1.1', '1.2', 'classificatie'},\n", " {'versnelling', 'versnellingen', 'versnellingsbak'},\n", " {'den', 'dennen', 'dennenboom'},\n", " {'voeding', 'voedingsmiddelen', 'voedsel'},\n", " {'aram', 'aramees', 'aramese'},\n", " {'botanica', 'botanicus', 'plantkundige'},\n", " {'alkali', 'base', 'basen'},\n", " {'bestand', 'bestanden', 'wapenstilstand'},\n", " {'dominicaans', 'dominicaanse', 'dominicaanse republiek'},\n", " {'taranto', 'tarente', 'tarentum'},\n", " {'kristal', 'kristallen', 'kristallijne'},\n", " {'codex', 'codices', 'manuscripten'},\n", " {'naturalisme', 'naturalistisch', 'naturalistische'},\n", " {'wortel', 'wortelen', 'wortels'},\n", " {'onafhankelijk', 'onafhankelijke', 'onafhankelijkheid'},\n", " {'anglicaans', 'anglicaanse', 'anglicaanse kerk'},\n", " {'protestants', 'protestantse', 'protestantse kerk'},\n", " {'duurzaam', 'duurzaamheid', 'duurzame'},\n", " {'leuvense universiteit', 'universiteit leuven', 'universiteit van leuven'},\n", " {'breuk', 'breuken', 'teller'},\n", " {'deel', 'delen', 'deling'},\n", " {'maya', 'mayas', 'mayastad'},\n", " {'orthodox', 'orthodoxe', 'orthodoxe kerk'},\n", " {'antillen', 'antilliaanse', 'nederlandse antillen'},\n", " {'keyboards', 'toetsen', 'toetsenist'},\n", " {'rollenspel', 'rollenspellen', 'rpg'},\n", " {'counties', 'county', 'countys'},\n", " {'jammu en kasjmir', 'kashmir', 'kasjmir'},\n", " {'schutterij', 'schutterijen', 'schuttersgilde'},\n", " {'overeenkomst', 'transactie', 'transacties'},\n", " {'hoer', 'prostituee', 'prostituees'},\n", " {'geloof', 'godsdienst', 'religieuze'},\n", " {'magazine', 'periodiek', 'tijdschrift'},\n", " {'dubbelspion', 'spion', 'spionage'},\n", " {'standard', 'standard de liège', 'standard luik'},\n", " {'leadzanger', 'zanger', 'zangeres'},\n", " {'punk', 'punkband', 'punkbeweging'},\n", " {'varken', 'varkens', 'zwijnen'},\n", " {'reïncarnatie', 'wedergeboorte', 'wedergeboren'},\n", " {'primera b', 'segunda división', 'segunda división a'},\n", " {'cultureel', 'culturen', 'cultuur'},\n", " {'ddr', 'duitse democratische republiek', 'oost-duitsland'},\n", " {'reggio', 'reggio emilia', 'reggio nellemilia'},\n", " {'bangladesh', 'bengaalse', 'bengalen'},\n", " {'chirurgische ingreep', 'operatie', 'operaties'},\n", " {'europees voetbal', 'europese competities', 'toernooien'},\n", " {'maat', 'maatsoort', 'maten'},\n", " {'j', 'jaar', 'jaren'},\n", " {'mongolen', 'mongools', 'mongoolse'},\n", " {'vietnam', 'vietnamees', 'vietnamese'},\n", " {'financiën', 'minister van financiën', 'ministerie van financiën'},\n", " {'open source', 'open-source', 'opensource'},\n", " {'toscaans', 'toscaanse', 'toscane'},\n", " {'matteüs', 'mattheus', 'mattheüs'},\n", " {'morfologie', 'morfologisch', 'morfologische'},\n", " {'strip', 'stripreeks', 'stripserie'},\n", " {'vertaalster', 'vertaler', 'vertaling'},\n", " {'vulkanisch', 'vulkanische', 'vulkanische activiteit'},\n", " {'dogma', 'dogmas', 'dogmatische'},\n", " {'madagaskar', 'malagassisch', 'malagassische'},\n", " {'koningin wilhelmina', 'prinses wilhelmina', 'wilhelmina'},\n", " {'canon', 'canoniek', 'canonieke'},\n", " {'gemeentefusie', 'gemeentelijke herindeling', 'herindeling'},\n", " {'geïmproviseerd', 'improvisatie', 'improviseren'},\n", " {'justitie', 'minister van justitie', 'ministerie van justitie'},\n", " {'eth', 'ethiopisch', 'ethiopië'},\n", " {'motorvermogen', 'vermogen', 'vermogens'},\n", " {'adelaar', 'adelaars', 'arend'},\n", " {'mijn', 'mijnen', 'zeemijn'},\n", " {'landskampioen sovjet-unie',\n", " 'landskampioen van de sovjet-unie',\n", " 'landskampioenschap van de sovjet-unie'},\n", " {'piramide', 'piramiden', 'piramides'},\n", " {'bafta', 'bafta award', 'bafta awards'},\n", " {'friese', 'friesland', 'friezen'},\n", " {'territoria', 'territoriaal', 'territorium'},\n", " {'harmonie', 'harmonieën', 'harmonisch'},\n", " {'sprinten', 'sprinter', 'sprinters'},\n", " {'kolom', 'zuil', 'zuilen'},\n", " {'arrangeerde', 'arrangement', 'arrangeur'},\n", " {'graaf', 'graf', 'graven'},\n", " {'realisme', 'realistisch', 'realistische'},\n", " {'schaap', 'schapen', 'schapenvlees'},\n", " {'communautaire', 'eu', 'europese unie'},\n", " {'maleise', 'maleisisch', 'maleisië'},\n", " {'zwart', 'zwarte', 'zwarten'},\n", " {'hobo', 'hobos', 'hoboïst'},\n", " {'syrisch', 'syrische', 'syrië'},\n", " {'de smurfen', 'smurf', 'smurfen'},\n", " {'bisdom münster', 'bisschop van münster', 'münster'},\n", " {'astronomen', 'astronoom', 'sterrenkundige'},\n", " {'nobelprijs', 'nobelprijswinnaar', 'nobelprijswinnares'},\n", " {'oezbeeks', 'oezbeekse', 'oezbekistan'},\n", " {'bijbel', 'bijbelboek', 'bijbelse'},\n", " {'thema', 'themas', 'thematiek'},\n", " {'lichaam', 'lichamen', 'menselijk lichaam'},\n", " {'staal', 'staalindustrie', 'stalen'},\n", " {'oostfront', 'westelijk front', 'westfront'},\n", " {'geallieerd', 'geallieerde', 'geallieerden'},\n", " {'buitenlandse zaken',\n", " 'minister van buitenlandse zaken',\n", " 'ministerie van buitenlandse zaken'},\n", " {'stads', 'stadsgezicht', 'stadsgezichten'},\n", " {'consul', 'consulaat', 'consuls'},\n", " {'aartsbisdom mainz', 'aartsbisschop van mainz', 'mainz'},\n", " {'handschrift', 'handschriften', 'manuscript'},\n", " {'sint truiden', 'sint-truiden', 'sint-truidense vv'},\n", " {'waalse', 'walen', 'wallonië'},\n", " {'orde', 'orden', 'ordes'},\n", " {'fotomodel', 'model', 'modellen'},\n", " {'hertogdom lotharingen', 'lotharingen', 'lotharingse'},\n", " {'azerbeidzjaans', 'azerbeidzjaanse', 'azerbeidzjan'},\n", " {'moldavisch', 'moldavische', 'moldavië'},\n", " {'oost-vlaams', 'oost-vlaamse', 'oost-vlaanderen'},\n", " {'beharing', 'haar', 'haard'},\n", " {'franken', 'frankisch', 'frankische'},\n", " {'boogschieten', 'boogschutter', 'boogschutters'},\n", " {'germaans', 'germaanse', 'germaanse talen'},\n", " {'producent', 'producenten', 'producer'},\n", " {'trinidad', 'trinidad & tobago', 'trinidad en tobago'},\n", " {'wit-rusland', 'wit-russisch', 'wit-russische'},\n", " {'filipijnen', 'filipijns', 'phi'},\n", " {'kazachs', 'kazachse', 'kazachstan'},\n", " {'franse bezetting', 'franse overheersing', 'franse tijd'},\n", " {'romantiek', 'romantisch', 'romantische'},\n", " {'albanees', 'albanese', 'albanezen'},\n", " {'georgisch', 'georgische', 'georgië'},\n", " {'sur', 'surinaams', 'suriname'},\n", " {'mediolanum', 'milaan', 'milano'},\n", " {'om', 'openbaar ministerie', 'parket'},\n", " {'napels', 'napolitaanse', 'neapolis'},\n", " {'bulgaars', 'bulgaarse', 'bulgaren'},\n", " {'theologe', 'theologen', 'theoloog'},\n", " {'chemie', 'chemische', 'scheikunde'},\n", " {'alessandro farnese', 'hertog van parma', 'parma'},\n", " {'lb', 'lublin', 'pound'},\n", " {'captain', 'kapitein', 'kapiteins'},\n", " {'bouwkundig', 'kunstwerk', 'kunstwerken'},\n", " {'zuid-afrika', 'zuid-afrikaans', 'zuid-afrikaanse'},\n", " {'schepen', 'schepenen', 'vaartuig'},\n", " {'leiden', 'leids', 'leidse'},\n", " {'alcohol', 'alcoholgebruik', 'alcoholische'},\n", " {'goud', 'gouden', 'gouden plaat'},\n", " {'boheems', 'boheemse', 'bohemen'},\n", " {'malta', 'maltees', 'maltese'},\n", " {'cuba', 'cubaan', 'cubaanse'},\n", " {'fluit', 'fluiten', 'fluitist'},\n", " {'iraans', 'iraanse', 'iran'},\n", " {'indonesisch', 'indonesische', 'indonesië'},\n", " {'joegoslavisch', 'joegoslavische', 'joegoslavië'},\n", " {'schimmel', 'schimmels', 'schimmelziekte'},\n", " {'saksen', 'saksisch', 'saksische'},\n", " {'ijsland', 'ijslands', 'ijslandse'},\n", " {'rhône', 'rhônedal', 'rhônevallei'},\n", " {'zeeland', 'zeeuwen', 'zeeuwse'},\n", " {'plp', 'prl', 'pvv'},\n", " {'liberaal', 'liberale', 'liberalen'},\n", " {'tur', 'turkije', 'turks voetbalelftal'},\n", " {'drum', 'drums', 'slagwerk'},\n", " {'senaat', 'senator', 'senatoren'},\n", " {'drama', 'dramafilm', 'dramaserie'},\n", " {'atheense', 'athene', 'atheners'},\n", " {'syn.', 'synoniem', 'synoniemen'},\n", " {'tweede klasse', 'tweede niveau', 'tweedeklasser'},\n", " {'brussel', 'brussels', 'brusselse'},\n", " {'holland', 'hollands', 'hollandse'},\n", " {'liège', 'luik', 'luikse'},\n", " {'antwerpen', 'antwerps', 'antwerpse'},\n", " {'new york', 'new york city', 'ny'},\n", " {'utrecht', 'utrechts', 'utrechtse'},\n", " {'en', 'engels', 'engelstalige'},\n", " {'groningen', 'groninger', 'groningse'},\n", " {'halve finale', 'halve finales'},\n", " {'nk sprint', 'nk
sprint'},\n", " {'jan pronk', 'pronk'},\n", " {'sonnet', 'sonnetten'},\n", " {'groeve', 'steengroeve'},\n", " {'corolla', 'toyota corolla'},\n", " {'versterker', 'versterkers'},\n", " {'reproductie', 'voortplanting'},\n", " {'aspect', 'aspecten'},\n", " {'amir', 'emir'},\n", " {'erf', 'erven'},\n", " {'ernst lodewijk', 'ernst lodewijk van hessen-darmstadt'},\n", " {'gemeentearchief', 'stadsarchief'},\n", " {'he', 'helium'},\n", " {'jaren 60', 'jaren zestig'},\n", " {'julia caesaris', 'julia caesaris maior'},\n", " {'natie', 'nationale'},\n", " {'lood', 'pb'},\n", " {'lezing', 'presentatie'},\n", " {'prince of wales', 'prins van wales'},\n", " {'beatrix', 'prinses beatrix'},\n", " {'quezon', 'tayabas'},\n", " {'commissariaten', 'rvc'},\n", " {'eredivisie 2011/12', 'seizoen 2011/12'},\n", " {'strijker', 'strijkers'},\n", " {'tl', 'vmbo-t'},\n", " {'pdc world darts championship', 'world darts championship'},\n", " {'algonkin', 'algonquin'},\n", " {'athelstan', 'æthelstan'},\n", " {'bridge', 'bridger'},\n", " {'british open', 'brits open'},\n", " {'caja rural', 'caja rural-seguros rga'},\n", " {'dwangarbeid', 'dwangarbeiders'},\n", " {'figurine', 'figurines'},\n", " {'gate', 'gates'},\n", " {'gedenkplaat', 'gedenksteen'},\n", " {'hond', 'honden'},\n", " {'comedian', 'humorist'},\n", " {'inhoud', 'volume'},\n", " {'israëlisch-palestijns conflict', 'israëlisch-palestijnse conflict'},\n", " {'gould', 'john gould'},\n", " {'landskampioen italië', 'lega basket serie a'},\n", " {'grote of sint-martinuskerk', 'martinuskerk'},\n", " {'michael schumacher', 'schumacher'},\n", " {'mozaïek', 'mozaïeken'},\n", " {'naald', 'naalden'},\n", " {'michajlov', 'nikolaj michajlov'},\n", " {'nk allround', 'nk
allround'},\n", " {'ode', 'oden'},\n", " {'olie-', 'oliemolen'},\n", " {'palestra itália', 'palmeiras'},\n", " {'grodziski', 'powiat grodziski'},\n", " {'recitatie', 'reciteren'},\n", " {'regenwoud', 'regenwouden'},\n", " {'revisionisme', 'revisionistische'},\n", " {'sagan', 'żagań'},\n", " {'saint martin', 'saint-martin'},\n", " {'ambrosius', 'santambrogio'},\n", " {'eredivisie 2008/09', 'seizoen 2008/09'},\n", " {'seoel', 'seoul'},\n", " {'tetraëder', 'tetraëdrisch'},\n", " {'hr.ms. tromp', 'tromp'},\n", " {'t-rex', 'tyrannosaurus rex'},\n", " {'conference league', 'uefa europa conference league'},\n", " {'nippo-vini fantini', 'vini fantini nippo'},\n", " {'wake', 'wake island'},\n", " {'algemeen directeur', 'ceo'},\n", " {'apocrief', 'apocriefe'},\n", " {'belijdenis', 'geloofsbelijdenis'},\n", " {'cliché', 'clichés'},\n", " {'dol', 'dol-de-bretagne'},\n", " {'dvd', 'dvd-speler'},\n", " {'egel', 'egels'},\n", " {'efeze', 'ephesus'},\n", " {'arbela', 'erbil'},\n", " {'exarch', 'exarchaat'},\n", " {'expeditie', 'expedities'},\n", " {'futen', 'fuut'},\n", " {'gomel', 'homel'},\n", " {'huisprelaat', 'pauselijk huisprelaat'},\n", " {'inr', 'nir'},\n", " {'gekroond', 'kroning'},\n", " {'lage vloer', 'lagevloer'},\n", " {'emanuel', 'manuel i'},\n", " {'maria amalia', 'maria amalia van oostenrijk'},\n", " {'nexus', 'the nexus'},\n", " {'onderwijs', 'vorming'},\n", " {'parachute', 'parachutisten'},\n", " {'pragmatisch', 'pragmatische'},\n", " {'radiostraling', 'rf'},\n", " {'diagram', 'schema'},\n", " {'sector', 'sectoren'},\n", " {'st albans', 'st. albans'},\n", " {'saint paul', 'st. paul'},\n", " {'the lord of the rings: the return of the king', 'the return of the king'},\n", " {'tijl uilenspiegel', 'uilenspiegel'},\n", " {'verdrag van versailles', 'vrede van versailles'},\n", " {'wk 1998', 'wk in frankrijk'},\n", " {'autonomie', 'zelfstandig'},\n", " {'augustijn', 'augustijnenklooster'},\n", " {'bas jacobs', 'jacobs'},\n", " {'edingen', 'enghien'},\n", " {'els', 'elzen'},\n", " {'everhard iii', 'everhard iii van württemberg'},\n", " {'famagusta', 'gazimağusa'},\n", " {'assemblée nationale', 'franse parlement'},\n", " {'futurisme', 'futuristen'},\n", " {'gregoriaans', 'gregoriaanse'},\n", " {'gehoor', 'horen'},\n", " {'benji', 'ji'},\n", " {'kolberg', 'kołobrzeg'},\n", " {'lateraal', 'laterale'},\n", " {'derry', 'londonderry'},\n", " {'deutschland', 'lützow'},\n", " {'maximum', 'minimum'},\n", " {'olifant', 'olifanten'},\n", " {'orlando city', 'orlando city sc'},\n", " {'palma', 'palma de mallorca'},\n", " {'pcc-car', 'pcc-cars'},\n", " {'pedro', 'pedro rodríguez'},\n", " {'pk', 'pki'},\n", " {'plesiosauriër', 'plesiosauriërs'},\n", " {'racing club gent', 'racing gent'},\n", " {'refuge', 'refugium'},\n", " {'robben', 'zeehonden'},\n", " {'sociaal netwerk', 'sociale netwerken'},\n", " {'sonnenburg', 'słońsk'},\n", " {'st helens', 'st. helens'},\n", " {'stabilisator', 'stabilisatoren'},\n", " {'stockholm', 'stockholm c'},\n", " {'topschutter', 'topscorer'},\n", " {'tram 4', 'tramlijn 4'},\n", " {'lijn 9', 'tramlijn 9'},\n", " {'trebnitz', 'trzebnica'},\n", " {'west-friesland', 'west-friezen'},\n", " {'alg', 'wieren'},\n", " {'salix', 'wilgen'},\n", " {'12 angry men', 'wel'},\n", " {'49er', '49erfx'},\n", " {'aartshertog van oostenrijk', 'aartshertogin van oostenrijk'},\n", " {'afrika cup', 'afrikaans kampioenschap'},\n", " {'alva', 'hertog van alva'},\n", " {'asiel', 'asielprocedure'},\n", " {'bogota', 'bogotá'},\n", " {'cheeta', 'cheetah'},\n", " {'copulatie', 'paren'},\n", " {'dynamiek', 'dynamisch'},\n", " {'ek onder 21', 'europees kampioenschap voetbal onder 21'},\n", " {'elvis', 'elvis presley'},\n", " {'eredivisionist', 'nederlandse eredivisie'},\n", " {'conserveren', 'geconserveerd'},\n", " {'geiser', 'geisers'},\n", " {'golden globe award', 'golden globes'},\n", " {'gudmundsson', 'guðmundsson'},\n", " {'chip', 'ics'},\n", " {'infiltratie', 'infiltreren'},\n", " {'internacional', 'sc internacional'},\n", " {'dag des oordeels', 'laatste oordeel'},\n", " {'laodicea', 'latakia'},\n", " {'libel', 'libellen'},\n", " {'lijn 12', 'tramlijn 12'},\n", " {'lijn 17', 'tramlijn 17'},\n", " {'louise marie', 'louise van orléans'},\n", " {'lemberg', 'lviv'},\n", " {'kinshasa', 'léopoldville'},\n", " {'lourdesgrot', 'mariagrot'},\n", " {'matrijs', 'matrijzen'},\n", " {'geometrie', 'meetkunde'},\n", " {'mesopotamische', 'mesopotamië'},\n", " {'landbouw, natuur en voedselkwaliteit',\n", " 'ministerie van landbouw en visserij'},\n", " {'moezel', 'mosel'},\n", " {'most valuable player', 'mvp'},\n", " {'datsun', 'nissan'},\n", " {'ontginning', 'ontgonnen'},\n", " {'oortje', 'oortjes'},\n", " {'paardenstaart', 'paardenstaarten'},\n", " {'alexander van oranje-nassau', 'prins alexander'},\n", " {'racing', 'racing club'},\n", " {'reflectie', 'reflector'},\n", " {'ladies tour of norway', 'ronde van noorwegen'},\n", " {'sterk', 'sterke'},\n", " {'soemba', 'sumba'},\n", " {'stazione termini', 'termini'},\n", " {'spoortunnel', 'tunnels'},\n", " {'verbrand', 'verbranding'},\n", " {'vierkant', 'vierkante'},\n", " {'waaier', 'waaiers'},\n", " {'wereldkampioenschappen in 2003', 'wk in parijs'},\n", " {'adelheid', 'adelheid van bourgondië'},\n", " {'alia', 'aliyah'},\n", " {'alternatieve', 'alternative'},\n", " {'de gelaarsde kat', 'gelaarsde kat'},\n", " {'de rode ridder', 'rode ridder'},\n", " {'de waarheid', 'volksdagblad'},\n", " {'derde persoon', 'eerste persoon'},\n", " {'beëdigd', 'eed'},\n", " {'erkend', 'erkenning'},\n", " {'etage', 'etages'},\n", " {'fortuna sittard', 'fsc'},\n", " {'bouwwerk', 'gebouw'},\n", " {'kalken', 'kalksteen'},\n", " {'kan', 'khan'},\n", " {'oculi', 'oculus'},\n", " {'philippus', 'philippus ii'},\n", " {'domproost', 'proosten'},\n", " {'francs borains', 'r. francs borains'},\n", " {'ritueel', 'rituelen'},\n", " {'rodelbaan', 'rodelen'},\n", " {'röntgen', 'röntgenstraling'},\n", " {'elisabeth van hongarije', 'sint-elisabeth'},\n", " {'stargard', 'stargard szczeciński'},\n", " {'stekelvarken', 'stekelvarkens'},\n", " {'moyland', 'till'},\n", " {'moment', 'torsie'},\n", " {'treptow', 'trzebiatów'},\n", " {'constellation', 'uss constellation'},\n", " {'vleet', 'want'},\n", " {'nesten', 'vogelnest'},\n", " {'voorstad', 'voorsteden'},\n", " {'waarneming', 'waarnemingen'},\n", " {'gereconstrueerd', 'wederopbouw'},\n", " {'jongerenklassement', 'witte trui'},\n", " {'zwak', 'zwakke'},\n", " {'gp van zweden', 'zwe'},\n", " {'aland', 'kurt'},\n", " {'boezem', 'boezems'},\n", " {'ccc polsat polkowice', 'ccc sprandi polkowice'},\n", " {'chp', 'republikeinse volkspartij'},\n", " {'correlatie', 'gecorreleerd'},\n", " {'cynische', 'cynisme'},\n", " {'dans', 'dansen'},\n", " {'devon', 'devonshire'},\n", " {'frederik willem', 'frederik willem i'},\n", " {'galba', 'servius sulpicius galba'},\n", " {'garibaldi', 'giuseppe garibaldi'},\n", " {'haring', 'haringen'},\n", " {'ambachtsheerlijkheid', 'heerlijkheid'},\n", " {'kers', 'kersen'},\n", " {'kiesdistrict', 'kiesdistricten'},\n", " {'le hom', 'thury-harcourt'},\n", " {'les villages vovéens', 'voves'},\n", " {'luchtvrachtmaatschappij', 'vrachtluchtvaartmaatschappij'},\n", " {'magnus eriksson', 'magnus ii'},\n", " {'easy listening', 'middle of the road'},\n", " {'mo', 'moguls'},\n", " {'noordelijke', 'verenigde nederlanden'},\n", " {'pd', 'pdl'},\n", " {'pga championship', 'pga kampioenschap'},\n", " {'gazprom-rusvelo', 'rusvelo'},\n", " {'scanner', 'scanners'},\n", " {'publius cornelius scipio', 'scipio'},\n", " {'shogun', 'shogunaat'},\n", " {'stand', 'standen'},\n", " {'soeur sourire', 'sœur sourire'},\n", " {'tactiek', 'tactische'},\n", " {'tataarse', 'tataren'},\n", " {'baroda', 'vadodara'},\n", " {'vendel', 'vendels'},\n", " {'villedieu-les-poêles', 'villedieu-les-poêles-rouffigny'},\n", " {'afrikaans', 'afrikaanse'},\n", " {'armada', 'spaanse armada'},\n", " {'b&b', 'bed & breakfast'},\n", " {'beursgenoteerd', 'beursgenoteerde'},\n", " {'elbing', 'elbląg'},\n", " {'emmerich', 'emmerik'},\n", " {'europees kampioenschap voetbal', 'europees kampioenschap voetbal 2016'},\n", " {'evangelische', 'evangelische kerk'},\n", " {'fan', 'fans'},\n", " {'fetisj', 'fetisjisme'},\n", " {'girne', 'kyrenia'},\n", " {'gracht', 'slotgracht'},\n", " {'graden', '°'},\n", " {'kwal', 'kwallen'},\n", " {'glatz', 'kłodzko'},\n", " {'l4', 'l6'},\n", " {'lourenço marques', 'maputo'},\n", " {'krijgskunst', 'martial arts'},\n", " {'minister van onderwijs, kunsten en wetenschappen',\n", " 'ministerie van onderwijs, kunsten en wetenschappen'},\n", " {'moord', 'vermoord'},\n", " {'myceense', 'myceners'},\n", " {'nowodworski', 'powiat nowodworski'},\n", " {'prins', 'prinses'},\n", " {'prins-bisschop', 'prinsbisdom'},\n", " {'radon', 'rn'},\n", " {'rui', 'ruien'},\n", " {'private', 'soldaten'},\n", " {'speerwerpen', 'speerwerper'},\n", " {'alpaca', 'alpacas'},\n", " {'angst', 'vrees'},\n", " {'bezirk', 'bezirke'},\n", " {'bielski', 'powiat bielski'},\n", " {'braam', 'bramen'},\n", " {'chiraal', 'chirale'},\n", " {'constante', 'constanten'},\n", " {'de krim', 'krim'},\n", " {'dialogen', 'dialoog'},\n", " {'escort', 'ford escort'},\n", " {'galerij', 'galerijen'},\n", " {'gebeurtenis', 'gebeurtenissen'},\n", " {'holstein-gottorp', 'sleeswijk-holstein-gottorp'},\n", " {'infrarood', 'ir'},\n", " {'john locke', 'locke'},\n", " {'melaatsen', 'melaatsheid'},\n", " {'melville', 'melville-eiland'},\n", " {'microprocessor', 'processor'},\n", " {'neil young', 'young'},\n", " {'olympische spelen in barcelona', 'olympische zomerspelen 1992'},\n", " {'opolski', 'powiat opolski'},\n", " {'pola', 'pula'},\n", " {'rondeel', 'rondelen'},\n", " {'sint-helena', 'st. helena'},\n", " {'saur-sojasun', 'sojasun'},\n", " {'radioshack', 'team radioshack'},\n", " {'tram 2', 'tramlijn 2'},\n", " {'lijn 25', 'tramlijn 25'},\n", " {'verpleegkunde', 'verpleegkundige'},\n", " {'ver', 'verspringen'},\n", " {'vrienden', 'vriendschap'},\n", " {'chan', 'zen'},\n", " {'aanbidding der wijzen', 'drie koningen'},\n", " {'anatomie', 'anatoom'},\n", " {'bekerwedstrijd', 'copa del rey'},\n", " {'brzeski', 'powiat brzeski'},\n", " {'digitaal', 'digitale'},\n", " {'europese parlementsverkiezingen', 'europese verkiezingen'},\n", " {'export', 'uitvoer'},\n", " {'altona', 'hamburg-altona'},\n", " {'hervormde', 'hervormde kerk'},\n", " {'herzogenrath', 's-hertogenrade'},\n", " {'indologie', 'indoloog'},\n", " {'iason', 'jason'},\n", " {'javaans', 'kawi'},\n", " {'geodeet', 'landmeter'},\n", " {'leenman', 'leenmannen'},\n", " {'aunay-sur-odon', 'les monts daunay'},\n", " {'lucia', 'santa lucia'},\n", " {'evros', 'maritsa'},\n", " {'mis', 'missen'},\n", " {'nk afstanden', 'nk
afstanden'},\n", " {'onthoofd', 'onthoofding'},\n", " {'sint-servaasbasiliek', 'sint-servaaskerk'},\n", " {'spongebob', 'spongebob squarepants'},\n", " {'tand', 'tanden'},\n", " {'tekenaar', 'tekenen'},\n", " {'theaterschool', 'toneelschool'},\n", " {'tiberius', 'tiberius claudius nero'},\n", " {'anders', 'trappist'},\n", " {'liga', 'voetbalcompetitie'},\n", " {'weerwolf', 'weerwolven'},\n", " {'grünberg', 'zielona góra'},\n", " {'middengewicht', 'zwaargewicht'},\n", " {'alexandrijnse', 'alexandrië'},\n", " {'bonen', 'boon'},\n", " {'congruent', 'congruentie'},\n", " {'die grünen', 'groenen'},\n", " {'axis', 'draaier'},\n", " {'ecologisch', 'ecologische'},\n", " {'ellips', 'elliptische'},\n", " {'fabrikant', 'industrieel'},\n", " {'foto', 'fotos'},\n", " {'glaciale', 'ijstijden'},\n", " {'handvest', 'manifest'},\n", " {'leeg', 'leegte'},\n", " {'olympische spelen van 1968', 'olympische zomerspelen 1968'},\n", " {'roest', 'roesten'},\n", " {'ruprecht', 'ruprecht van de palts'},\n", " {'symmetrie', 'symmetrisch'},\n", " {'trivia', 'triviaal'},\n", " {'hallescher fc wacker', 'wacker halle'},\n", " {'woord', 'woorden'},\n", " {'3000 m steeplechase', 'steeplechase'},\n", " {'accommodatie', 'accommoderen'},\n", " {'alpecin-fenix', 'corendon-circus'},\n", " {'anale', 'anus'},\n", " {'apartheid', 'apartheidsregime'},\n", " {'boeddhisme', 'boeddhistische'},\n", " {'boei', 'boeien'},\n", " {'brantôme', 'brantôme en périgord'},\n", " {'certificaat', 'certificaten'},\n", " {'de zeven provinciën', 'hr.ms. de zeven provinciën'},\n", " {'boekdrukker', 'drukker'},\n", " {'jacob', 'jakob'},\n", " {'katholieke universiteit', 'ru'},\n", " {'kea', 'keos'},\n", " {'la', 'los angeles'},\n", " {'middeleeuwen', 'middeleeuwse'},\n", " {'minister van onderwijs', 'ministerie van onderwijs'},\n", " {'moderne', 'moderne kunst'},\n", " {'natuurwet', 'natuurwetten'},\n", " {'sneeuwwitje', 'sneeuwwitje en de zeven dwergen'},\n", " {'magiër', 'tovenaar'},\n", " {'tsaar', 'tsaristische'},\n", " {'tuig', 'tuigage'},\n", " {'unitarisme', 'unitariërs'},\n", " {'verontreiniging', 'vervuiling'},\n", " {'licht', 'zichtbaar licht'},\n", " {'het zwin', 'zwin'},\n", " {'aasgarnalen', 'aasgarnalensoort'},\n", " {'brahmana', 'brahmanas'},\n", " {'categorie', 'categorieën'},\n", " {'doop', 'gedoopt'},\n", " {'etsen', 'etser'},\n", " {'gang', 'gangen'},\n", " {'gisting', 'vergisting'},\n", " {'goederenvervoer', 'transport'},\n", " {'gondel', 'gondels'},\n", " {'hydraulisch', 'hydraulische'},\n", " {'geïnterneerd', 'internering'},\n", " {'koninginnedag', 'koningsdag'},\n", " {'opéra comique', 'opéra-comique'},\n", " {'lomonosov', 'oranienbaum'},\n", " {'pees', 'pezen'},\n", " {'pot', 'potten'},\n", " {'pylonen', 'pyloon'},\n", " {'stadia', 'stadium'},\n", " {'gooi', 't gooi'},\n", " {'urinoir', 'urinoirs'},\n", " {'fc wacker innsbruck', 'wacker innsbruck'},\n", " {'achtergrond', 'achtergrondzang'},\n", " {'belgisch kampioen', 'belgisch kampioenschap'},\n", " {'chopper', 'choppers'},\n", " {'constitutie', 'constitutionele'},\n", " {'episch', 'epische'},\n", " {'first-person', 'fps'},\n", " {'hortus', 'hortus botanicus'},\n", " {'jan verheyen', 'verheyen'},\n", " {'landskampioenschap van rusland', 'russische superliga'},\n", " {'lijn 6', 'tramlijn 6'},\n", " {'locomotief', 'locomotieven'},\n", " {'logica', 'logische'},\n", " {'oceanische', 'oceanië'},\n", " ...]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sorted(clusters, key=len)[::-1]" ] }, { "cell_type": "code", "execution_count": null, "id": "75340f6b-359f-412c-a8c4-aa32e42d2a67", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" } }, "nbformat": 4, "nbformat_minor": 5 }