Created
October 14, 2019 12:16
-
-
Save zilista/047d9dd24157b05813497c9ff63432e4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pymorphy2\n", | |
"import requests\n", | |
"import json\n", | |
"import re\n", | |
"\n", | |
"morph = pymorphy2.MorphAnalyzer()\n", | |
"\n", | |
"token = \"xxxxxxxxxxxxxxxxxxxxx\"\n", | |
"ser_id = 174 #174 #ID поисковой системы яндекс_спб" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"\"\"\"\n", | |
"\n", | |
"Предварительно заполняем словарь key-value. \n", | |
"\n", | |
"Key - название группы, \n", | |
"\n", | |
"Value - список, куда добавляем ключи каждой группы\n", | |
"\n", | |
"На входе txt-файл ('data_tz.txt') в формате: Ключ -> Группа\n", | |
"\n", | |
"\"\"\"\n", | |
"\n", | |
"item_dict = {}\n", | |
"flag = True\n", | |
"\n", | |
"with open('data_tz.txt') as file:\n", | |
" \n", | |
" for line in file:\n", | |
" \n", | |
" if flag:\n", | |
" flag = False # пропускаем строку заголовка\n", | |
" else:\n", | |
" line = line.strip().split('\t')\n", | |
" word = line[0]\n", | |
" group = line[1]\n", | |
"\n", | |
" if group not in item_dict:\n", | |
" item_dict[group] = []\n", | |
" item_dict[group].append(word)\n", | |
" else:\n", | |
" item_dict[group].append(word)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"end\n" | |
] | |
} | |
], | |
"source": [ | |
"group_word_count_dict = {}\n", | |
"\n", | |
"\"\"\"\n", | |
"\n", | |
"Для каждого ключа обходим все ключевые фразы, разбиваем фразу на слова, нормализуем и добавляем в словарь\n", | |
"\n", | |
"\"\"\"\n", | |
"\n", | |
"for key, value in item_dict.items():\n", | |
" group_word_count_dict.setdefault(key, {})\n", | |
" \n", | |
" for item in value:\n", | |
" \n", | |
" for word in re.findall(\"([А-ЯЁа-яё0-9]+(-[А-ЯЁа-яё0-9]+)*)\", item):\n", | |
" word = word[0]\n", | |
" word = morph.parse(word)[0].normal_form\n", | |
" form = morph.parse(word)[0].tag\n", | |
" \n", | |
" #не добавляем в словарь местоимение-существительное, предлог, союз, частица, междометие\n", | |
" if ('NPRO' in form or 'PREP' in form or 'CONJ' in form or 'PRCL' in form or 'INTJ' in form):\n", | |
" continue\n", | |
" else:\n", | |
" group_word_count_dict[key].setdefault(word, 0)\n", | |
" \n", | |
" if word in group_word_count_dict[key]:\n", | |
" group_word_count_dict[key][word] += 1\n", | |
" \n", | |
"#Сортировка получивщегося словаря\n", | |
"for key, value in group_word_count_dict.items():\n", | |
" sorted_group_word_count_dict = list(value.items())\n", | |
" sorted_group_word_count_dict.sort(key=lambda i: i[1], reverse=True)\n", | |
" sorted_group_word_count_dict = dict(sorted_group_word_count_dict) \n", | |
" group_word_count_dict[key] = sorted_group_word_count_dict\n", | |
"\n", | |
"# print(group_word_count_dict)\n", | |
"print('end')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#Получаем данные по api и парсим полученный текст\n", | |
"\n", | |
"def megaindex_text_score(key):\n", | |
"\n", | |
" try:\n", | |
" url = 'http://api.megaindex.com/visrep/text_score?key={}&words={}&ser_id={}'.format(token, key, ser_id)\n", | |
" r = requests.get(url)\n", | |
" json_string = r.text\n", | |
" parsed_string = json.loads(json_string)['data']\n", | |
" list_base_urls = parsed_string['serps'][0]['base_urls']\n", | |
" symbols_median = parsed_string['old_api']['fragments']['long']['symbols_median']\n", | |
" \n", | |
" except Exception as ex_megaindex:\n", | |
" print(\"API megaindex request error: {error}\".format(error=ex_megaindex))\n", | |
" list_base_urls = ['Данные не получены']\n", | |
" symbols_median = 0\n", | |
" \n", | |
" \n", | |
" return(list_base_urls, symbols_median)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# чистим файл\n", | |
"f = open('group_word_lemma.txt', 'w')\n", | |
"f.write('Группа' +'\\t' + 'Конкуренты' +'\\t' + 'Символов ЗБП'+ '\\n')\n", | |
"f.close()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"with open('group_word_lemma.txt' , 'a') as f:\n", | |
" \n", | |
" for key_dict, value_dict in group_word_count_dict.items():\n", | |
" \n", | |
" base_urls, symbols_median = megaindex_text_score(key_dict)\n", | |
" \n", | |
" if symbols_median < 8000: # Ограничение по количеству символов\n", | |
" \n", | |
" #print(key_dict, base_urls, symbols_median)\n", | |
" \n", | |
" f.write('{}\\t{}\\t{}\\n\\n'.format(key_dict, base_urls, symbols_median))\n", | |
" f.write('Лемма' +'\\t' + 'Количество повторений' + '\\n')\n", | |
" \n", | |
" for key, value in value_dict.items():\n", | |
" #print(key, value)\n", | |
" f.write('{}\\t{}\\n'.format(key, value))\n", | |
" f.write('\\n'+'\\n'+'\\n')\n", | |
"\n", | |
"print('end')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment