Created
October 14, 2019 13:15
-
-
Save zilista/adc935e6b050b5fc48675a5fbfabaf45 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 56, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import requests\n", | |
"import json\n", | |
"import pymorphy2\n", | |
"import re\n", | |
"import urllib.request as urlrequest\n", | |
"from urllib.parse import urlencode\n", | |
"from collections import Counter # Считаем частоты" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Для работы понадобятся:\n", | |
" - token - токен api megaindex (https://ru.megaindex.com/api)\n", | |
" - ser_id - регион, по которому будут сниматься данные\n", | |
" - keywords_url_dict - словарь ключевых слов, для которых будем получать данные" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 57, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
# MegaIndex API token (https://ru.megaindex.com/api).
# NOTE(review): hardcoded placeholder — keep real tokens out of source
# control; prefer reading from an environment variable.
token = "xxxxxxxxxxxxxxxxxxx"
ser_id = 174 # search-engine ID; Yandex St. Petersburg = 174
# Marker key phrase -> URL of the article for that phrase.
keywords_url_dict = {'основной маркерный запрос статьи №1':'url_основного маркерного запроса статьи №1', 'основной маркерный запрос статьи №2':'url_основного маркерного запроса статьи №2'}

morph = pymorphy2.MorphAnalyzer() # pymorphy2 instance, used below for morphological analysis
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Для получения ключевых слов по нужным нам маркерным запросам будем использовать метод url_keywords API Serpstat (https://serpstat.com/ru/api/117-url-adresa-organicheskih-slov-urlkeywords/). Данный метод возвращает ключевые фразы в топе поисковой системы по заданному URL.\n", | |
"\n", | |
"Для работы берем пример кода из документации и оборачиваем его в функцию serpstat_keywords. Подставляем свои значения для \"token\" и региона \"se\", по которому будем получать данные. Получить список регионов можно здесь https://serpstat.com/ru/api/272-spisok-dostupnih-baz-databasesinfo/" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 58, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
def serpstat_keywords(url):
    """Fetch organic keywords ranking for a URL from the Serpstat v3 API.

    Uses the ``url_keywords`` method
    (https://serpstat.com/ru/api/117-url-adresa-organicheskih-slov-urlkeywords/).

    Parameters
    ----------
    url : str
        Page URL to query.

    Returns
    -------
    dict or None
        Parsed JSON response, or ``None`` when the HTTP request fails.
    """
    host = 'http://api.serpstat.com/v3'
    method = 'url_keywords'
    params = {
        'query': '{}'.format(url),       # string for get info
        'se': 'y_2',                     # string search engine
        'token': 'xxxxxxxxxxxxxxxxxxx',  # personal token — keep out of source control
    }

    api_url = "{host}/{method}?{params}".format(
        host=host,
        method=method,
        params=urlencode(params)
    )

    try:
        json_data = urlrequest.urlopen(api_url).read()
    except Exception as e0:
        # BUG FIX: the original fell through to json.loads(json_data) with
        # json_data undefined, raising NameError and masking the real error.
        # Return None so callers can skip this URL.
        print("API request error: {error}".format(error=e0))
        return None

    return json.loads(json_data)
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 59, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
def morph_word_lemma(key):
    """Split a phrase into words, lemmatize each, and drop function words.

    Words are extracted with a regular expression, lemmatized with
    pymorphy2, checked by part of speech, and collected into a list.

    NOTE: pymorphy2 handles Russian only — words in other languages are
    effectively passed through/ignored by the analyzer.

    Parameters
    ----------
    key : str
        Key phrase to process.

    Returns
    -------
    list of str
        Normal forms (lemmas) of the meaningful words, in original order.
    """
    # Grammemes to skip (pronouns, prepositions, conjunctions, particles,
    # interjections):
    # https://pymorphy2.readthedocs.io/en/latest/user/grammemes.html
    meaningfullPoSes = ['NPRO', 'PREP', 'CONJ', 'PRCL', 'INTJ']
    reswords = []

    # Split the phrase into words (Cyrillic letters/digits, hyphens allowed).
    for word in re.findall("([А-ЯЁа-яё0-9]+(-[А-ЯЁа-яё0-9]+)*)", key):
        word = word[0]
        # PERF FIX: the original called morph.parse(word)[0] twice per word;
        # parse once and reuse the result.
        parsed = morph.parse(word)[0]
        if parsed.tag.POS in meaningfullPoSes:
            continue
        reswords.append(parsed.normal_form)

    return reswords
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 60, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
def counter_dict_list(list_values):
    """Build a "lemma -> occurrence count" mapping, sorted by count (desc).

    Parameters
    ----------
    list_values : iterable of str
        Key phrases; each is lemmatized via morph_word_lemma().

    Returns
    -------
    dict
        Lemma -> number of occurrences, insertion-ordered from most to
        least frequent (ties keep first-seen order, as in the original).
    """
    lemma_counter = Counter()

    # BUG FIX: the original reused the loop variable `item` for both the
    # outer phrase loop and the inner lemma loop; distinct names avoid
    # that shadowing trap.
    for phrase in list_values:
        lemma_counter.update(morph_word_lemma(phrase))

    # most_common() already yields (lemma, count) pairs sorted descending
    # with a stable sort — equivalent to the original manual sort.
    return dict(lemma_counter.most_common())
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 64, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
# Truncate the output file and write the tab-separated header row.
header_columns = [
    "key", "compare_urls", "base_urls", "relevance",
    "symbols median", "symbols text", "symbols diff",
    "words median", "words value text", "words diff",
]
# Context manager closes the file even if write() raises (the original
# used a bare open()/close() pair with no error handling).
with open('api.txt', 'w') as f:
    f.write('\t'.join(header_columns) + '\n')
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 65, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
# Fetch data from the MegaIndex API and parse the returned text metrics.

def megaindex_text_score(key, key_url):
    """Query the MegaIndex text_score API for one key/URL pair, then collect
    competitor keywords (via Serpstat) for the top-ranked base URLs.

    Parameters
    ----------
    key : str
        Marker key phrase.
    key_url : str
        URL of our page for this key phrase.

    Returns
    -------
    tuple
        (list_base_urls, relevance, symbols_median, symbols_text,
        symbols_diff, words_median, words_value_text, words_diff,
        count_lemma). Metric fields fall back to the placeholder string
        'Данные не получены' when the API request fails.
    """
    keyword_list = []

    # BUG FIX: initialize every returned metric up front. The original set
    # fallbacks only for list_base_urls and symbols_median, so a failed API
    # call led to a NameError at the return statement.
    list_base_urls = []
    relevance = 'Данные не получены'
    symbols_median = 'Данные не получены'
    symbols_text = 'Данные не получены'
    symbols_diff = 'Данные не получены'
    words_median = 'Данные не получены'
    words_value_text = 'Данные не получены'
    words_diff = 'Данные не получены'

    try:
        url = 'http://api.megaindex.com/visrep/text_score?key={}&words={}&ser_id={}&compare_urls={}'.format(token, key, ser_id, key_url)
        r = requests.get(url)
        parsed_string = json.loads(r.text)['data']
        serp = parsed_string['serps'][0]          # first SERP snapshot
        compare = serp['compare_urls'][0]         # metrics for key_url
        list_base_urls = serp['base_urls']
        relevance = compare['relevance'] * 100
        symbols_median = parsed_string['old_api']['fragments']['long']['symbols_median']
        symbols_text = parsed_string['old_api']['compare_docs'][key_url]['fragments']['long']['symbols']
        symbols_diff = symbols_median - symbols_text
        word_count = compare['diffs']['word_count']['long']
        words_median = word_count['median']
        words_value_text = word_count['value']
        words_diff = word_count['diff']
    except Exception as ex_megaindex:
        print("API megaindex request error: {error}".format(error=ex_megaindex))

    for base_url in list_base_urls:
        base_url = base_url.replace('http:', 'https:')
        data = serpstat_keywords(base_url)
        try:
            for keyword in data['result']['hits']:
                keyword_list.append(keyword['keyword'])
        except (TypeError, KeyError):
            # Serpstat returned None or an unexpected shape for this URL —
            # skip it (narrowed from the original bare `except: pass`).
            pass

    # De-duplicate keywords before counting lemmas (set order is
    # arbitrary, matching the original behavior).
    uniq_keyword_list = list(set(keyword_list))
    count_lemma = counter_dict_list(uniq_keyword_list)

    return (list_base_urls, relevance, symbols_median, symbols_text,
            symbols_diff, words_median, words_value_text, words_diff,
            count_lemma)
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 66, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Всего будет сгенерировано ТЗ: 3\n", | |
"end\n" | |
] | |
} | |
], | |
"source": [ | |
print ('Всего будет сгенерировано ТЗ: ', len(keywords_url_dict))

# For each marker key phrase, fetch metrics + lemma counts and append a
# tab-separated report section to api.txt.
for keywords in keywords_url_dict.keys():
    try:
        (list_base_urls, relevance, symbols_median, symbols_text,
         symbols_diff, words_median, words_value_text, words_diff,
         count_lemma) = megaindex_text_score(keywords, keywords_url_dict[keywords])
    except Exception as ex:
        # BUG FIX: the original fell through after a failure and used the
        # undefined result variables below (NameError); it also misspelled
        # "Error". Log and skip this key instead.
        print(f'Error: {ex}')
        continue

    with open('api.txt', 'a') as f:
        f.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t\n\n'.format(
            keywords, keywords_url_dict[keywords], list_base_urls, relevance,
            symbols_median, symbols_text, symbols_diff, words_median,
            words_value_text, words_diff))
        f.write('Лемма' + '\t' + 'Количество повторений' + '\n')

        for key, value in count_lemma.items():
            f.write('{}\t{}\n'.format(key, value))
        f.write('\n' + '\n' + '\n')

print ('end')
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment