{
"cells": [
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import json\n",
"import pymorphy2\n",
"import re\n",
"import urllib.request as urlrequest\n",
"from urllib.parse import urlencode\n",
"from collections import Counter # Считаем частоты"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Для работы понадобятся:\n",
" - token - токен api megaindex (https://ru.megaindex.com/api)\n",
" - ser_id - регион, по которому будут сниматься данные\n",
" - keywords_list - словарь ключевых слов, для которых ьудем получать данные"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"token = \"xxxxxxxxxxxxxxxxxxx\"\n",
"ser_id = 174 #ID поисковой системы яндекс_спб - 174\n",
"keywords_url_dict = {'основной маркерный запрос статьи №1':'url_основного маркерного запроса статьи №1', 'основной маркерный запрос статьи №2':'url_основного маркерного запроса статьи №2'}\n",
"\n",
"morph = pymorphy2.MorphAnalyzer() # создаем экземпляр pymorphy2, понадобится нам дальше для морфологического анализа"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Для получения ключевых слов по нужным нам маркерным запросам будем использовать метод url_keywords API Serpstat (https://serpstat.com/ru/api/117-url-adresa-organicheskih-slov-urlkeywords/). Данный метод возвращает ключевые фразы в топе поисковой системы по заданному URL.\n",
"\n",
"Для работы берем пример кода из документации и оборачиваем его в функцию serpstat_keywords. Подставляем свои значения для \"token\" и региону \"se\", по которому будем получать данные. Получить список регионов можно здесь https://serpstat.com/ru/api/272-spisok-dostupnih-baz-databasesinfo/"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"def serpstat_keywords(url):\n",
" \n",
" host = 'http://api.serpstat.com/v3'\n",
" method = 'url_keywords'\n",
" params = {\n",
" 'query': '{}'.format(url), # string for get info\n",
" 'se': 'y_2', # string search engine\n",
" 'token': 'xxxxxxxxxxxxxxxxxxx', # string personal token\n",
" }\n",
"\n",
" api_url = \"{host}/{method}?{params}\".format(\n",
" host=host,\n",
" method=method,\n",
" params=urlencode(params)\n",
" )\n",
"\n",
" try:\n",
" json_data = urlrequest.urlopen(api_url).read()\n",
" except Exception as e0:\n",
" print(\"API request error: {error}\".format(error=e0))\n",
" pass\n",
" \n",
" data = json.loads(json_data)\n",
"\n",
" return data"
]
},
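{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal smoke test for serpstat_keywords (the URL below is a hypothetical placeholder, not from the original notebook). Judging by how the response is consumed later on, a successful reply carries the phrases under data['result']['hits'] as a list of dicts with a 'keyword' field; on a failed request the function returns an empty dict."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# hypothetical URL, for illustration only\n",
"data = serpstat_keywords('https://example.com/some-article/')\n",
"hits = data.get('result', {}).get('hits', []) # empty list if the request failed\n",
"print(len(hits), 'keywords found')\n",
"for hit in hits[:5]:\n",
"    print(hit['keyword'])"
]
},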
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
"Используя регулярное выражение разбиваем входную фразу на слова. \n",
"\n",
"Эти слова лемматизируем, проверяем на часть речи и добавляем в результирующий список.\n",
"\n",
"Возвращаем список.\n",
"\n",
"! Не забываем что pymorphy2 работает только с русским языком. \n",
"\n",
"Если в словосочетаниях будут фразы на другом языке, он их пропустит.\n",
"\n",
"\"\"\"\n",
"\n",
"def morph_word_lemma(key):\n",
" \n",
" meaningfullPoSes=['NPRO', 'PREP', 'CONJ', 'PRCL', 'INTJ'] # фильтруем граммемы https://pymorphy2.readthedocs.io/en/latest/user/grammemes.html\n",
" reswords=[]\n",
"\n",
" for word in re.findall(\"([А-ЯЁа-яё0-9]+(-[А-ЯЁа-яё0-9]+)*)\", key): # фразу бьем на слова\n",
" word = word[0]\n",
" word_normal_form = morph.parse(word)[0].normal_form\n",
" form = morph.parse(word)[0].tag\n",
" \n",
" if form.POS in meaningfullPoSes:\n",
" continue\n",
" else:\n",
" reswords.append(word_normal_form)\n",
" \n",
" return reswords"
]
},
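{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick check of morph_word_lemma on a sample phrase. The expected output is indicative only: pymorphy2 returns its most probable parse, so individual lemmas may differ."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 'в' is a preposition (PREP) and should be dropped;\n",
"# expected output along the lines of ['купить', 'красный', 'диван', 'москва']\n",
"print(morph_word_lemma('купить красный диван в Москве'))"
]
},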
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
"\n",
"Составляем словарь вида \"Лемма: [количество упоминаний леммы]\"\n",
"\n",
"\"\"\"\n",
"\n",
"def counter_dict_list(list_values):\n",
" \n",
" list_values_all=[]\n",
" \n",
" for item in list_values:\n",
" list_values_word_lemma = morph_word_lemma(item)\n",
" \n",
" for item in list_values_word_lemma:\n",
" list_values_all.append(item)\n",
" dict_values_word_lemma = dict(Counter(list_values_all))\n",
" \n",
" sorted_dict_values_word_lemma = list(dict_values_word_lemma.items())\n",
" sorted_dict_values_word_lemma.sort(key=lambda i: i[1], reverse=True)\n",
" sorted_dict_values_word_lemma = dict(sorted_dict_values_word_lemma)\n",
" \n",
" return (sorted_dict_values_word_lemma)"
]
},
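{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small illustration of counter_dict_list on two sample phrases: the shared lemma comes first with a count of 2."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# expected output along the lines of {'диван': 2, 'купить': 1, 'угловой': 1}\n",
"print(counter_dict_list(['купить диван', 'диван угловой']))"
]
},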
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"# чистим файл и записываем строку заголовка\n",
"f = open('api.txt', 'w')\n",
"f.write(\"key\"+'\\t'+\"compare_urls\" + '\\t' + \"base_urls\"+ '\\t' + \"relevance\" + '\\t' + 'symbols median' + '\\t' +'symbols text'+ '\\t' + 'symbols diff'+ '\\t'+ 'words median' + '\\t' + 'words value text' + '\\t' + 'words diff' + '\\n')\n",
"f.close()"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"#Получаем данные по api megaindex и парсим полученный текст\n",
"\n",
"def megaindex_text_score(key, key_url):\n",
" \n",
" keyword_list = []\n",
" uniq_keyword_list = []\n",
" \n",
" try:\n",
" url = 'http://api.megaindex.com/visrep/text_score?key={}&words={}&ser_id={}&compare_urls={}'.format(token, key, ser_id, key_url)\n",
" r = requests.get(url)\n",
" json_string = r.text\n",
" parsed_string = json.loads(json_string)['data']\n",
" list_base_urls = parsed_string['serps'][0]['base_urls']\n",
" relevance = parsed_string['serps'][0]['compare_urls'][0]['relevance']*100\n",
" symbols_median = parsed_string['old_api']['fragments']['long']['symbols_median']\n",
" symbols_text = parsed_string['old_api']['compare_docs'][key_url]['fragments']['long']['symbols']\n",
" symbols_diff = symbols_median - symbols_text\n",
" words_median = parsed_string['serps'][0]['compare_urls'][0]['diffs']['word_count']['long']['median']\n",
" words_value_text = parsed_string['serps'][0]['compare_urls'][0]['diffs']['word_count']['long']['value']\n",
" words_diff = parsed_string['serps'][0]['compare_urls'][0]['diffs']['word_count']['long']['diff']\n",
" \n",
" except Exception as ex_megaindex:\n",
" print(\"API megaindex request error: {error}\".format(error=ex_megaindex))\n",
" list_base_urls = []\n",
" symbols_median = 'Данные не получены'\n",
" \n",
" \n",
" for url in list_base_urls:\n",
" url = url.replace('http:', 'https:')\n",
" data = serpstat_keywords(url)\n",
" \n",
" try:\n",
" for keyword in data['result']['hits']:\n",
" keyword_list.append(keyword['keyword'])\n",
" except:\n",
" pass\n",
" \n",
" for item in set(keyword_list):\n",
" uniq_keyword_list.append(item)\n",
" \n",
" count_lemma = counter_dict_list(uniq_keyword_list)\n",
" \n",
" return (list_base_urls, relevance, symbols_median, symbols_text, symbols_diff, words_median, words_value_text, words_diff, count_lemma)\n"
]
},
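{
"cell_type": "markdown",
"metadata": {},
"source": [
"An optional dry run for a single key (commented out so it does not spend API quota): unpack the nine returned values for one marker query before looping over the whole dict."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# key, url = next(iter(keywords_url_dict.items()))\n",
"# (base_urls, relevance, symbols_median, symbols_text, symbols_diff,\n",
"#  words_median, words_value_text, words_diff, count_lemma) = megaindex_text_score(key, url)\n",
"# print(relevance, symbols_diff, words_diff)"
]
},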
{
"cell_type": "code",
"execution_count": 66,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Всего будет сгенерировано ТЗ: 3\n",
"end\n"
]
}
],
"source": [
"print ('Всего будет сгенерировано ТЗ: ', len(keywords_url_dict))\n",
"\n",
"for keywords in keywords_url_dict.keys():\n",
" #print(keywords, keywords_url_dict[keywords])\n",
" \n",
" try:\n",
" list_base_urls, relevance, symbols_median, symbols_text, symbols_diff, words_median, words_value_text, words_diff, count_lemma = megaindex_text_score(keywords, keywords_url_dict[keywords])\n",
" except Exception as ex:\n",
" pass\n",
" print(f'Errow: {ex}')\n",
" \n",
" with open('api.txt', 'a') as f:\n",
" f.write('{}\\t{}\\t{}\\t{}\\t{}\\t{}\\t{}\\t{}\\t{}\\t{}\\t\\n\\n'.format(keywords, keywords_url_dict[keywords], list_base_urls, relevance, symbols_median, symbols_text, symbols_diff, words_median, words_value_text, words_diff)) \n",
" f.write('Лемма' +'\\t' + 'Количество повторений' + '\\n')\n",
" \n",
" for key, value in count_lemma.items():\n",
" f.write('{}\\t{}\\n'.format(key, value))\n",
" f.write('\\n'+'\\n'+'\\n')\n",
"\n",
"print ('end')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}