Created
February 10, 2022 11:43
-
-
Save avidale/70045403f6b5b678974956641c86f659 to your computer and use it in GitHub Desktop.
conceptnet5_russified.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 507, | |
"id": "f1018ba7", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from datasets import load_dataset\n", | |
"dataset = load_dataset(\"conceptnet5\", \"conceptnet5\", streaming=True)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "5e7392f4", | |
"metadata": {}, | |
"source": [ | |
"Напрямую между русскими понятиями интересных отношений - очень немного. \n", | |
"\n", | |
"Но можно для каждого интересного отношения между нерусскими понятиями найти русские синонимы, и таким образом собрать русский concept-net. " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 148, | |
"id": "1f7b0c6b", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from tqdm.auto import tqdm, trange\n", | |
"from collections import Counter" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 150, | |
"id": "42d9093c", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "7a0616f04240429ea14a4890eebdf0ea", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"0it [00:00, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"cn_ru = []\n", | |
"\n", | |
"tq = tqdm(dataset['train'])\n", | |
"for i, item in enumerate(tq):\n", | |
" if i % 1000 == 0:\n", | |
" tq.set_description(f'{len(cn_ru)} / {i} / {item[\"lang\"]}')\n", | |
" if item['lang'] and 'ru' in item['lang']:\n", | |
" cn_ru.append(item)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 152, | |
"id": "962bb06f", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('ru', 480208),\n", | |
" ('ru/en.wiktionary.org', 374041),\n", | |
" ('ru/fr.wiktionary.org', 175954),\n", | |
" ('ru/en', 158924),\n", | |
" ('en/ru', 71414),\n", | |
" ('ru/ru.dbpedia.org', 32302),\n", | |
" ('ru/fr', 22531),\n", | |
" ('ja/ru', 13107),\n", | |
" ('de/ru', 12705),\n", | |
" ('fr/ru', 9553),\n", | |
" ('rup', 6654),\n", | |
" ('mul/ru', 5933),\n", | |
" ('rup/en', 5029),\n", | |
" ('rup/en.wiktionary.org', 2758),\n", | |
" ('rup/la', 2426),\n", | |
" ('rup/ro', 2085),\n", | |
" ('en/rup', 1886),\n", | |
" ('ru/de', 1683),\n", | |
" ('ru/orv', 1069),\n", | |
" ('ru/la', 925)]" | |
] | |
}, | |
"execution_count": 152, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"Counter(item['lang'] for item in cn_ru).most_common(20)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 153, | |
"id": "111139f5", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import random" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 502, | |
"id": "39aa1b05", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'sentence': '',\n", | |
" 'extra_info': '{\"dataset\": \"/d/wiktionary/en\", \"license\": \"cc:by-sa/4.0\", \"sources\": [{\"contributor\": \"/s/resource/wiktionary/en\", \"process\": \"/s/process/wikiparsec/2\"}], \"weight\": 1.0}\\n',\n", | |
" 'rel': '/r/Synonym',\n", | |
" 'weight': 1.0,\n", | |
" 'arg1': \"/c/en/i_don't_care\",\n", | |
" 'arg2': '/c/ru/мне_без_разницы',\n", | |
" 'full_rel': \"/a/[/r/Synonym/,/c/en/i_don't_care/,/c/ru/мне_без_разницы/]\",\n", | |
" 'lang': 'en/ru'}" | |
] | |
}, | |
"execution_count": 502, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"random.choice(cn_ru)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 158, | |
"id": "61ab6fb3", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('/r/ExternalURL', 585310),\n", | |
" ('/r/FormOf', 344356),\n", | |
" ('/r/RelatedTo', 234778),\n", | |
" ('/r/Synonym', 155380),\n", | |
" ('/r/DerivedFrom', 28476),\n", | |
" ('/r/EtymologicallyRelatedTo', 17807),\n", | |
" ('/r/HasContext', 12816),\n", | |
" ('/r/EtymologicallyDerivedFrom', 10355),\n", | |
" ('/r/SymbolOf', 5886),\n", | |
" ('/r/IsA', 1682),\n", | |
" ('/r/Antonym', 1365),\n", | |
" ('/r/DistinctFrom', 846),\n", | |
" ('/r/SimilarTo', 171),\n", | |
" ('/r/PartOf', 16),\n", | |
" ('/r/MannerOf', 1)]" | |
] | |
}, | |
"execution_count": 158, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"Counter(item['rel'] for item in cn_ru).most_common(20)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 180, | |
"id": "894edd09", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"793022" | |
] | |
}, | |
"execution_count": 180, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"subset = [item for item in cn_ru if item['rel'] != '/r/ExternalURL' and 'ru' in item['lang'].split('/')]\n", | |
"len(subset)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 271, | |
"id": "485ea93b", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"480208\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"[('/r/FormOf', 343855),\n", | |
" ('/r/RelatedTo', 91119),\n", | |
" ('/r/DerivedFrom', 27581),\n", | |
" ('/r/Synonym', 11167),\n", | |
" ('/r/EtymologicallyRelatedTo', 2469),\n", | |
" ('/r/IsA', 1676),\n", | |
" ('/r/Antonym', 1310),\n", | |
" ('/r/DistinctFrom', 842),\n", | |
" ('/r/SimilarTo', 171),\n", | |
" ('/r/PartOf', 16),\n", | |
" ('/r/EtymologicallyDerivedFrom', 1),\n", | |
" ('/r/MannerOf', 1)]" | |
] | |
}, | |
"execution_count": 271, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"subset = [item for item in cn_ru if item['rel'] != '/r/ExternalURL' and item['lang'] == 'ru']\n", | |
"print(len(subset))\n", | |
"Counter(item['rel'] for item in subset).most_common(20)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "e15574c8", | |
"metadata": {}, | |
"source": [ | |
"```\n", | |
"[('/r/FormOf', 343855), - чисто грамматика\n", | |
" ('/r/RelatedTo', 91119), - обычно однокоренные слова\n", | |
" ('/r/DerivedFrom', 27581), - однокоренные слова\n", | |
" + ('/r/Synonym', 11167), - в основном, нормальные синонимы\n", | |
" ('/r/EtymologicallyRelatedTo', 2469),\n", | |
" + ('/r/IsA', 1676), - норм гиперонимы\n", | |
" + ('/r/Antonym', 1310), - норм антонимы\n", | |
" + ('/r/DistinctFrom', 842), - норм когипонимы и квази-антонимы\n", | |
" + ('/r/SimilarTo', 171), - норм когипонимы\n", | |
" ('/r/PartOf', 16),\n", | |
" ('/r/EtymologicallyDerivedFrom', 1),\n", | |
" ('/r/MannerOf', 1)]\n", | |
"```" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 928, | |
"id": "2a3c3b6c", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"good_russian_relations = ['/r/Synonym', '/r/IsA', '/r/Antonym', '/r/DistinctFrom', '/r/SimilarTo']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 925, | |
"id": "1705261c", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"subset = [item for item in cn_ru if item['rel'] == '/r/SimilarTo' and item['lang'] == 'ru']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 926, | |
"id": "c3a52fc9", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'sentence': '',\n", | |
" 'extra_info': '{\"dataset\": \"/d/wiktionary/en\", \"license\": \"cc:by-sa/4.0\", \"sources\": [{\"contributor\": \"/s/resource/wiktionary/en\", \"process\": \"/s/process/wikiparsec/2\"}], \"weight\": 1.0}\\n',\n", | |
" 'rel': '/r/SimilarTo',\n", | |
" 'weight': 1.0,\n", | |
" 'arg1': '/c/ru/двоечник/n',\n", | |
" 'arg2': '/c/ru/четвёрочник',\n", | |
" 'full_rel': '/a/[/r/SimilarTo/,/c/ru/двоечник/n/,/c/ru/четвёрочник/]',\n", | |
" 'lang': 'ru'}" | |
] | |
}, | |
"execution_count": 926, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"random.choice(subset)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 726, | |
"id": "85c16c5a", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"234778\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"[('/r/RelatedTo', 234778)]" | |
] | |
}, | |
"execution_count": 726, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"subset = [item for item in cn_ru if item['rel'] == ]\n", | |
"print(len(subset))\n", | |
"Counter(item['rel'] for item in subset).most_common(20)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 504, | |
"id": "88109998", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from collections import defaultdict" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 506, | |
"id": "68b1b725", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "6c6026dd1b284557996281e2172acf35", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
" 0%| | 0/1399245 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"104937\n" | |
] | |
} | |
], | |
"source": [ | |
"russian_synonyms = defaultdict(set)\n", | |
"\n", | |
"for item in tqdm(cn_ru):\n", | |
" if item['rel'] != '/r/Synonym':\n", | |
" continue\n", | |
" ru1 = 'ru' in item['arg1'].split('/')\n", | |
" ru2 = 'ru' in item['arg2'].split('/')\n", | |
" if ru1 and not ru2:\n", | |
" russian_synonyms[item['arg2']].add(item['arg1'])\n", | |
" elif ru2 and not ru1:\n", | |
" russian_synonyms[item['arg1']].add(item['arg2'])\n", | |
"print(len(russian_synonyms))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 637, | |
"id": "eb3fffbe", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "6096786923a64cf0af1106509afd3bc2", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
" 0%| | 0/1399245 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"957314" | |
] | |
}, | |
"execution_count": 637, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"russian_counts = Counter()\n", | |
"for item in tqdm(cn_ru):\n", | |
" for a in [item['arg1'], item['arg2']]:\n", | |
" if 'ru' in a.split('/'):\n", | |
" russian_counts[a] += 1\n", | |
"len(russian_counts)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 638, | |
"id": "edaf8d1c", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('/c/ru/вода/n', 256),\n", | |
" ('/c/ru/россия/n', 194),\n", | |
" ('/c/ru/спасибо', 167),\n", | |
" ('/c/ru/один', 162),\n", | |
" ('/c/ru/рвать', 155),\n", | |
" ('/c/ru/ходить/v', 144),\n", | |
" ('/c/ru/бить/v', 141),\n", | |
" ('/c/ru/читать/v', 140),\n", | |
" ('/c/ru/валить', 129),\n", | |
" ('/c/ru/двигать', 122)]" | |
] | |
}, | |
"execution_count": 638, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"russian_counts.most_common(10)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1039, | |
"id": "aad5e1bf", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['/c/ru/полицай',\n", | |
" '/c/ru/милиционер',\n", | |
" '/c/ru/полицейский',\n", | |
" '/c/ru/коп',\n", | |
" '/c/ru/мент']" | |
] | |
}, | |
"execution_count": 1039, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import numpy as np\n", | |
"\n", | |
"def concept_to_rus(concept, max_size=5, sample=True):\n", | |
" if concept not in russian_synonyms:\n", | |
" return []\n", | |
" items = list(russian_synonyms[concept])\n", | |
" if not sample or len(items) <= max_size:\n", | |
" return sorted(items, key=lambda x: russian_counts[x], reverse=True)[:max_size]\n", | |
" p = np.array([russian_counts[item] for item in items])\n", | |
" p = p / p.sum()\n", | |
" return [items[i] for i in np.random.choice(len(items), size=max_size, replace=False, p=p)]\n", | |
"\n", | |
"concept_to_rus('/c/en/police_officer/n')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 511, | |
"id": "06f77bf3", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "38369a92df30412bb0571e636c9cd1a6", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"0it [00:00, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"cn_translatable = []\n", | |
"\n", | |
"tq = tqdm(dataset['train'])\n", | |
"for i, item in enumerate(tq):\n", | |
" if i % 1000 == 0:\n", | |
" tq.set_description(f'{len(cn_translatable)} / {i} / {item[\"lang\"]}')\n", | |
" if item['arg1'] in russian_synonyms and item['arg2'] in russian_synonyms:\n", | |
" cn_translatable.append(item)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 621, | |
"id": "fc91e609", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('/r/RelatedTo', 118364),\n", | |
" ('/r/Synonym', 61264),\n", | |
" ('/r/IsA', 21010),\n", | |
" ('/r/HasContext', 18906),\n", | |
" ('/r/DerivedFrom', 5645),\n", | |
" ('/r/AtLocation', 4860),\n", | |
" ('/r/dbpedia/genre', 2134),\n", | |
" ('/r/dbpedia/genus', 1929),\n", | |
" ('/r/Antonym', 1608),\n", | |
" ('/r/EtymologicallyRelatedTo', 1569),\n", | |
" ('/r/PartOf', 1351),\n", | |
" ('/r/dbpedia/influencedBy', 1237),\n", | |
" ('/r/DistinctFrom', 1012),\n", | |
" ('/r/InstanceOf', 760),\n", | |
" ('/r/UsedFor', 682),\n", | |
" ('/r/SimilarTo', 669),\n", | |
" ('/r/dbpedia/language', 648),\n", | |
" ('/r/dbpedia/occupation', 610),\n", | |
" ('/r/dbpedia/field', 561),\n", | |
" ('/r/dbpedia/knownFor', 423),\n", | |
" ('/r/HasProperty', 330),\n", | |
" ('/r/CapableOf', 284),\n", | |
" ('/r/FormOf', 280),\n", | |
" ('/r/dbpedia/product', 278),\n", | |
" ('/r/dbpedia/capital', 248),\n", | |
" ('/r/EtymologicallyDerivedFrom', 228),\n", | |
" ('/r/Desires', 201),\n", | |
" ('/r/HasA', 145),\n", | |
" ('/r/NotDesires', 132),\n", | |
" ('/r/HasPrerequisite', 123),\n", | |
" ('/r/Causes', 119),\n", | |
" ('/r/MadeOf', 114),\n", | |
" ('/r/HasSubevent', 111),\n", | |
" ('/r/MotivatedByGoal', 110),\n", | |
" ('/r/dbpedia/leader', 78),\n", | |
" ('/r/CausesDesire', 58),\n", | |
" ('/r/CreatedBy', 45),\n", | |
" ('/r/LocatedNear', 18),\n", | |
" ('/r/HasFirstSubevent', 16),\n", | |
" ('/r/NotHasProperty', 16)]" | |
] | |
}, | |
"execution_count": 621, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"Counter(item['rel'] for item in cn_translatable).most_common(40)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 596, | |
"id": "3530f329", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"/c/en/earth /c/en/mars\n", | |
"{'/c/ru/земля'} {'/c/ru/марс'}\n", | |
"\n", | |
"/c/en/globular_cluster /c/en/galaxy\n", | |
"{'/c/ru/шаровое_звёздное_скопление'} {'/c/ru/галактика'}\n", | |
"\n", | |
"/c/en/vacuum /c/en/outer_space\n", | |
"{'/c/ru/вакуум'} {'/c/ru/космическое_пространство'}\n", | |
"\n", | |
"/c/en/mosquito /c/en/water\n", | |
"{'/c/ru/комары'} {'/c/ru/вода', '/c/ru/вода/n'}\n", | |
"\n", | |
"/c/en/helium /c/en/star\n", | |
"{'/c/ru/гелий'} {'/c/ru/звезда'}\n", | |
"\n", | |
"/c/en/mosquito /c/en/water\n", | |
"{'/c/ru/комары'} {'/c/ru/вода', '/c/ru/вода/n'}\n", | |
"\n", | |
"/c/en/globular_cluster /c/en/galaxy\n", | |
"{'/c/ru/шаровое_звёздное_скопление'} {'/c/ru/галактика'}\n", | |
"\n", | |
"/c/en/beaver /c/en/dam\n", | |
"{'/c/ru/бобры'} {'/c/ru/плотина'}\n", | |
"\n", | |
"/c/en/squirrel /c/en/tree\n", | |
"{'/c/ru/беличьи'} {'/c/ru/дерево'}\n", | |
"\n", | |
"/c/en/mosquito /c/en/water\n", | |
"{'/c/ru/комары'} {'/c/ru/вода', '/c/ru/вода/n'}\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"subset = [item for item in cn_translatable if item['rel'] == '/r/LocatedNear']\n", | |
"for i in range(10):\n", | |
" item = random.choice(subset)\n", | |
" print(item['arg1'], item['arg2'])\n", | |
" print(russian_synonyms[item['arg1']], russian_synonyms[item['arg2']])\n", | |
" print()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 996, | |
"id": "5ecb328f", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"relations_filter = {\n", | |
" '/r/RelatedTo': 'связано с', # произвольные ассоциации\n", | |
" '/r/Synonym': 'синоним', # обычно это одно и то же слово\n", | |
" '/r/IsA': '– это', # гипоним - гипероним\n", | |
" '/r/HasContext': 'относится к теме', # предмет - тематика\n", | |
" '/r/DerivedFrom': None, # этимология; описывает не понятие, а слово \n", | |
" '/r/AtLocation': 'находится в',\n", | |
" '/r/dbpedia/genre': 'относится к жанру', # музыкальные жанры\n", | |
" '/r/dbpedia/genus': 'относитcя к роду', # биологичесий род\n", | |
" '/r/Antonym': 'это антоним',\n", | |
" '/r/EtymologicallyRelatedTo': None,\n", | |
" '/r/PartOf': 'это часть',\n", | |
" '/r/dbpedia/influencedBy': 'испытал влияние', # связи творческих людей\n", | |
" '/r/DistinctFrom': 'отличается от', # ко-гипонимы\n", | |
" '/r/InstanceOf': '– это один из',\n", | |
" '/r/UsedFor': 'используется для',\n", | |
" '/r/SimilarTo': 'похож на',\n", | |
" '/r/dbpedia/language': None, # объект и его язык - но очень много путаницы\n", | |
" '/r/dbpedia/occupation': 'по профессии', # шумноватая категория\n", | |
" '/r/dbpedia/field': 'знаменит в области', # человек и область\n", | |
" '/r/dbpedia/knownFor': 'известен благодаря',\n", | |
" '/r/HasProperty': 'обладает свойством',\n", | |
" '/r/CapableOf': 'может', # предполагается, что слева - глагол\n", | |
" '/r/FormOf': None, # шумная категория\n", | |
" '/r/dbpedia/product': 'производит', # компания - продукт\n", | |
" '/r/dbpedia/capital': 'имеет в качестве столицы',\n", | |
" '/r/EtymologicallyDerivedFrom': None,\n", | |
" '/r/Desires': 'хочет',\n", | |
" '/r/HasA': 'имеет',\n", | |
" '/r/NotDesires': 'не хочет',\n", | |
" '/r/HasPrerequisite': 'требует', # для Х нужно Y\n", | |
" '/r/Causes': 'причиняет',\n", | |
" '/r/MadeOf': 'сделан из',\n", | |
" '/r/HasSubevent': None, # шумная категория\n", | |
" '/r/MotivatedByGoal': 'можно ради', # глагол, ради которого другой глагол\n", | |
" '/r/dbpedia/leader': None, # не очень информативно, территория - правитель\n", | |
" '/r/CausesDesire': 'вызывает желание',\n", | |
" '/r/CreatedBy': 'создан', # икс создан игреком (или иногда - из игрека)\n", | |
"}\n", | |
"filtered_relations = {k: v for k, v in relations_filter.items() if v}\n", | |
"assert len(filtered_relations) == len(set(filtered_relations.values()))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 997, | |
"id": "6a3f5b41", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"нормальный связано с производная функции\n", | |
"бес связано с демонология\n", | |
"beretta ar 70 – это один из автомат\n", | |
"лёгкий сон связано с сон\n", | |
"досаждать синоним чума\n" | |
] | |
} | |
], | |
"source": [ | |
"for i in range(10):\n", | |
" item = random.choice(cn_translatable)\n", | |
" if item['rel'] not in filtered_relations:\n", | |
" continue\n", | |
" a1 = random.choice(list(russian_synonyms[item['arg1']])).split('/')[3].replace('_', ' ')\n", | |
" a2 = random.choice(list(russian_synonyms[item['arg2']])).split('/')[3].replace('_', ' ')\n", | |
" if a1 == a2:\n", | |
" continue\n", | |
" r = filtered_relations[item['rel']]\n", | |
" print(a1, r, a2)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1047, | |
"id": "27bf9d2c", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "fa09f870ebfb491a8dc9878805d768e7", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
" 0%| | 0/1399245 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"15166\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "5e8f49245e12414bad3e7672ccd4b80e", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
" 0%| | 0/1399245 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"168940\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "4296de5d01064b11a4bb61b9f508f3e5", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
" 0%| | 0/248217 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"543200\n" | |
] | |
} | |
], | |
"source": [ | |
"russified_conceptnet = []\n", | |
"for item in tqdm(cn_ru):\n", | |
" if item['rel'] in good_russian_relations:\n", | |
" if 'ru' in item['arg1'].split('/') and 'ru' in item['arg2'].split('/'):\n", | |
" russified_conceptnet.append({\n", | |
" 'arg1': item['arg1'],\n", | |
" 'arg2': item['arg2'],\n", | |
" 'rel': item['rel'],\n", | |
" 'source': 'original',\n", | |
" 'source_triplet': [item['arg1'], item['rel'], item['arg2']],\n", | |
" })\n", | |
"print(len(russified_conceptnet))\n", | |
"\n", | |
"for item in tqdm(cn_ru):\n", | |
" if item['rel'] not in filtered_relations:\n", | |
" continue\n", | |
" for a1 in concept_to_rus(item['arg1']) + ([item['arg1']] if 'ru' in item['arg1'].split('/') else []):\n", | |
" for a2 in concept_to_rus(item['arg2']) + ([item['arg2']] if 'ru' in item['arg2'].split('/') else []):\n", | |
" if a1 == a2:\n", | |
" continue\n", | |
" if a1 == item['arg1'] and a2 == item['arg2']:\n", | |
" continue\n", | |
" russified_conceptnet.append({\n", | |
" 'arg1': a1,\n", | |
" 'arg2': a2,\n", | |
" 'rel': item['rel'],\n", | |
" 'source': 'half_translated',\n", | |
" 'source_triplet': [item['arg1'], item['rel'], item['arg2']],\n", | |
" })\n", | |
"print(len(russified_conceptnet))\n", | |
"\n", | |
"for item in tqdm(cn_translatable):\n", | |
" if item['rel'] not in filtered_relations:\n", | |
" continue\n", | |
" for a1 in concept_to_rus(item['arg1']):\n", | |
" for a2 in concept_to_rus(item['arg2']):\n", | |
" if a1 == a2:\n", | |
" continue\n", | |
" russified_conceptnet.append({\n", | |
" 'arg1': a1,\n", | |
" 'arg2': a2,\n", | |
" 'rel': item['rel'],\n", | |
" 'source': 'translated',\n", | |
" 'source_triplet': [item['arg1'], item['rel'], item['arg2']],\n", | |
" })\n", | |
"print(len(russified_conceptnet))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1144, | |
"id": "ed2038b0", | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'arg1': '/c/ru/бес',\n", | |
" 'arg2': '/c/ru/сверхъестественное',\n", | |
" 'rel': '/r/RelatedTo',\n", | |
" 'source': 'translated',\n", | |
" 'source_triplet': ['/c/en/demon/n', '/r/RelatedTo', '/c/en/supernatural']}" | |
] | |
}, | |
"execution_count": 1144, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"random.choice(russified_conceptnet)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1151, | |
"id": "b4c16907", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('/r/RelatedTo', 251251),\n", | |
" ('/r/Synonym', 168700),\n", | |
" ('/r/IsA', 54001),\n", | |
" ('/r/HasContext', 38493),\n", | |
" ('/r/AtLocation', 6791),\n", | |
" ('/r/Antonym', 4875),\n", | |
" ('/r/DistinctFrom', 2862),\n", | |
" ('/r/dbpedia/genre', 2270),\n", | |
" ('/r/dbpedia/genus', 1949),\n", | |
" ('/r/PartOf', 1947),\n", | |
" ('/r/SimilarTo', 1363),\n", | |
" ('/r/dbpedia/influencedBy', 1289),\n", | |
" ('/r/UsedFor', 1185),\n", | |
" ('/r/InstanceOf', 910),\n", | |
" ('/r/dbpedia/occupation', 751),\n", | |
" ('/r/HasProperty', 670),\n", | |
" ('/r/CapableOf', 571),\n", | |
" ('/r/dbpedia/field', 567),\n", | |
" ('/r/dbpedia/knownFor', 435),\n", | |
" ('/r/Desires', 330),\n", | |
" ('/r/dbpedia/product', 328),\n", | |
" ('/r/MotivatedByGoal', 271),\n", | |
" ('/r/dbpedia/capital', 259),\n", | |
" ('/r/HasA', 224),\n", | |
" ('/r/HasPrerequisite', 211),\n", | |
" ('/r/MadeOf', 183),\n", | |
" ('/r/Causes', 174),\n", | |
" ('/r/NotDesires', 165),\n", | |
" ('/r/CausesDesire', 114),\n", | |
" ('/r/CreatedBy', 61)]" | |
] | |
}, | |
"execution_count": 1151, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"Counter(item['rel'] for item in russified_conceptnet).most_common(30)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "a5b5ff8f", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# todo: complete these templates using tools such as spacy/natasha and pymorphy2 and tons of rules\n", | |
"\n", | |
"def to_instrumental(x):\n", | |
" return x\n", | |
"\n", | |
"def adjust_short(v, x):\n", | |
" return v\n", | |
"\n", | |
"def adjust_verb(v, x):\n", | |
" return v\n", | |
"\n", | |
"def to_genitive(y):\n", | |
" return y\n", | |
"\n", | |
"def to_locative(y):\n", | |
" return y\n", | |
"\n", | |
"def to_accusative(y):\n", | |
" return y\n", | |
"\n", | |
"def to_dative(y):\n", | |
" return y\n", | |
"\n", | |
"def to_multiple(y):\n", | |
" return y" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1224, | |
"id": "a2d5db32", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"verbalizers = {\n", | |
" '/r/RelatedTo': lambda x, y: f'{x} {adjust_short(\"связан\", v)} с {to_instrumental(y)}',\n", | |
" '/r/Synonym': lambda x, y: f'\"{x}\" – синоним \"{to_genitive(y)}\"', # обычно это одно и то же слово\n", | |
" '/r/IsA': lambda x, y: f'{x} – это {y}', # '– это', # гипоним - гипероним\n", | |
" '/r/HasContext': lambda x, y: f'{x} относится к теме \"{y}\"', # предмет - тематика\n", | |
" '/r/AtLocation': lambda x, y: f'{x} находится в {to_locative(y)}',\n", | |
" '/r/dbpedia/genre': lambda x, y: f'{x} относится к жанру {y}', # музыкальные жанры\n", | |
" '/r/dbpedia/genus': lambda x, y: f'{x} относитcя к роду {y}', # биологичесий род\n", | |
" '/r/Antonym': lambda x, y: f'\"{x}\" – это антоним \"{y}\"',\n", | |
" '/r/PartOf': lambda x, y: f'{x} – это часть {to_genitive(y)}',\n", | |
" '/r/dbpedia/influencedBy': lambda x, y: f'{x} испытал влияние {to_genitive(y)}', # связи творческих людей\n", | |
" '/r/DistinctFrom': lambda x, y: f'{x} отличается от {to_genitive(y)}', # ко-гипонимы\n", | |
" '/r/InstanceOf': lambda x, y: f'{x} – это один из {to_genitive(to_multiple(y))}',\n", | |
" '/r/UsedFor': lambda x, y: f'{x} используется для {to_genitive(y)}',\n", | |
" '/r/SimilarTo': lambda x, y: f'{x} {adjust_short(\"похож\", x)} на {to_genitive(y)}',\n", | |
" '/r/dbpedia/occupation': lambda x, y: f'{x} по профессии {y}', # шумноватая категория\n", | |
" '/r/dbpedia/field': lambda x, y: f'{x} {adjust_short(\"знаменит\", x)} в области {to_genitive(y)}', # человек и область\n", | |
" '/r/dbpedia/knownFor': lambda x, y: f'{x} {adjust_short(\"известен\", x)} благодаря {to_dative(y)}',\n", | |
" '/r/HasProperty': lambda x, y: f'{x} обладает свойством {y}',\n", | |
" '/r/CapableOf': lambda x, y: f'{x} {adjust_verb(\"может\", x)} {y}', # предполагается, что слева - глагол\n", | |
" '/r/dbpedia/product': lambda x, y: f'{x} {adjust_verb(\"производит\", x)} {to_accusative(y)}', # компания - продукт\n", | |
" '/r/dbpedia/capital': lambda x, y: f'{y} – столица {to_genitive(y)}',\n", | |
" '/r/Desires': lambda x, y: f'{x} {adjust_verb(\"хочет\", x)} {to_accusative(y)}',\n", | |
" '/r/HasA': lambda x, y: f'{x} {adjust_verb(\"имеет\", x)} {to_accusative(y)}',\n", | |
" '/r/NotDesires': lambda x, y: f'{x} не {adjust_verb(\"хочет\", x)} {to_accusative(y)}',\n", | |
" '/r/HasPrerequisite': lambda x, y: f'{x} {adjust_verb(\"требует\", x)} {to_accusative(y)}', # для Х нужно Y\n", | |
" '/r/Causes': lambda x, y: f'{x} {adjust_verb(\"причиняет\", x)} {to_accusative(y)}',\n", | |
" '/r/MadeOf': lambda x, y: f'{x} {adjust_short(\"сделан\", x)} из {to_genitive(y)}',\n", | |
" '/r/MotivatedByGoal': lambda x, y: f'{x} можно ради {y}', # глагол, ради которого другой глагол\n", | |
" '/r/CausesDesire': lambda x, y: f'{x} вызывает желание {y}',\n", | |
" '/r/CreatedBy': lambda x, y: f'{x} {adjust_short(\"создан\", x)} {to_instrumental(y)}', # икс создан игреком (или иногда - из игрека)\n", | |
"}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1260, | |
"id": "f1328ec7", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"for item in russified_conceptnet:\n", | |
" text = verbalizers[item['rel']](item['arg1'].split('/')[3].replace('_', ' '), item['arg2'].split('/')[3].replace('_', ' '))\n", | |
" item['sentence_dirty'] = text " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1295, | |
"id": "83b04788", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"{'arg1': '/c/ru/заглавная_страница', 'arg2': '/c/ru/автор/n', 'rel': '/r/RelatedTo', 'source': 'translated', 'source_triplet': ['/c/en/title_page/n', '/r/RelatedTo', '/c/en/author'], 'sentence_dirty': 'заглавная страница связан с автор'}\n", | |
"заглавная страница связан с автор\n" | |
] | |
} | |
], | |
"source": [ | |
"item = random.choice(russified_conceptnet)\n", | |
"print(item)\n", | |
"print(verbalizers[item['rel']](item['arg1'].split('/')[3].replace('_', ' '), item['arg2'].split('/')[3].replace('_', ' ')))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1296, | |
"id": "be9d88f1", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "0d293709c4ac46219697b0d98cfed963", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
" 0%| | 0/543200 [00:00<?, ?it/s]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"import json\n", | |
"with open('conceptnet5_russified.jsonl', 'w') as f:\n", | |
" for i, item in enumerate(tqdm(russified_conceptnet)):\n", | |
" json.dump(item, f, ensure_ascii=False)\n", | |
" f.write('\\n')" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment