Google Search API Working Example - Python 3 + MSSQL
#!/usr/local/bin/python3
'''
This program gets all search terms from Google Search Console and saves them to an MSSQL database.

Please note: this .py file alone is not enough. You need to create a Google service account in the
Google API Console and add that service account as a user of the Google Search Console property.

Points to note:
===============
- By default (and this cannot be changed), as of this writing, Google Search Console does not
  provide any search data for the most recent 3 days.
- For example, if today is 24 May 2019, you CANNOT get search term results for 23, 22 or 21 May.
  The latest available data is for 20 May. (And of course 24 May itself, today, is excluded too.)
- Read the command-line argument descriptions. They are important.
'''
import argparse
import os, time
import sys
from google.oauth2 import service_account
import googleapiclient.discovery
import datetime
import pyodbc
import urllib.parse

# https://stackoverflow.com/a/1301528/1802483
os.environ['TZ'] = "Asia/Hong_Kong"
time.tzset()

# How to do server-to-server connection using OAuth:
# https://developers.google.com/api-client-library/python/auth/service-accounts
# API reference:
# https://developers.google.com/apis-explorer/?hl=zh-TW#p/webmasters/v3/webmasters.searchanalytics.query
# Required:
# pip3 install google-auth google-auth-httplib2 google-api-python-client
# https://github.com/mkleehammer/pyodbc/wiki/Connecting-to-SQL-Server-from-Windows#using-an-odbc-driver
# https://docs.microsoft.com/en-us/sql/connect/odbc/linux-mac/installing-the-microsoft-odbc-driver-for-sql-server?view=sql-server-2017

# Declare command-line flags.
argparser = argparse.ArgumentParser(add_help=False)
argparser.add_argument('property_uri', type=str, help='Site or app URI to query data for (including trailing slash).')
argparser.add_argument('--days', default=1, type=int, help='Number of days of data to get, counting back from today. 1 means today only (default); 2 means yesterday and today.')

def main(argv):
    args = argparser.parse_args()
    property_uri = args.property_uri
    days = args.days

    # https://stackoverflow.com/a/993367/1802483
    today = datetime.datetime.today()
    date_list = [today - datetime.timedelta(days=i) for i in reversed(range(0, days))]

    SCOPES = ['https://www.googleapis.com/auth/webmasters.readonly']

    # Why define a table name? Because this script may be used by different websites,
    # each of which stores its data in a different table.
    if property_uri == "https://website1.com":
        tablename = "[ba_analysis].[dbo].[website1]"
        SERVICE_ACCOUNT_FILE = '/root/googlesearch/website1_service_account_secret_download_from_google_api_console.json'
    elif property_uri == "https://website2.com":
        tablename = "[ba_analysis].[dbo].[website2]"
        SERVICE_ACCOUNT_FILE = '/root/googlesearch/website2_service_account_secret_download_from_google_api_console.json'
    else:
        sys.exit("Unknown property URI: " + property_uri)

    credentials = service_account.Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPES)
    webmaster_service = googleapiclient.discovery.build("webmasters", "v3", credentials=credentials)

    # site_list = webmaster_service.sites().list().execute()
    # print(site_list); return

    for current_date in date_list:
        data_date = start_date = end_date = current_date.strftime("%Y-%m-%d")

        request = {
            'startDate': start_date,
            'endDate': end_date,
            "dimensions": ["page", "query", "device"],
            "searchType": "web"
        }

        result = webmaster_service.searchanalytics().query(siteUrl=property_uri, body=request).execute()

        if not result.get('rows'):
            continue

        # https://github.com/mkleehammer/pyodbc/wiki/Getting-started
        cnxn = pyodbc.connect(driver='{ODBC Driver 17 for SQL Server}', server='mssql_server_ip', database='db_name', uid='user', pwd='password')
        cnxn.setencoding(encoding='utf-8')

        # Create a cursor from the connection
        cursor = cnxn.cursor()

        for row in result['rows']:
            # "keys" holds the requested dimensions in order: page, query, device.
            search_query = row["keys"][1]
            landing_page = urllib.parse.unquote(row["keys"][0], encoding='utf-8', errors='replace')
            device = row["keys"][2]
            impressions = str(row["impressions"])
            clicks = str(row["clicks"])
            average_position = str(row["position"])
            click_through_rate = str(row["ctr"])

            try:
                # Use a parameterized query so quotes in search terms need no manual escaping.
                # The table name cannot be a parameter, so it is interpolated separately.
                sql = ("INSERT INTO {:s} (search_query, landing_page, device, impressions, clicks, "
                       "average_position, click_through_rate, data_date) "
                       "VALUES(?, ?, ?, ?, ?, ?, ?, ?)").format(tablename)
                cursor.execute(sql, search_query, landing_page, device, impressions, clicks,
                               average_position, click_through_rate, data_date)
            except pyodbc.ProgrammingError as ex:
                print("Exception: ")
                print(ex)
                print("Error in SQL: " + sql)
                return

        cnxn.commit()
        cnxn.close()

if __name__ == '__main__':
    main(sys.argv)
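The gist does not include the MSSQL table schema, but the INSERT statement implies the columns, and every value is inserted as a string. A minimal sketch of a matching table, created once through the same pyodbc connection (the NVARCHAR types and sizes are assumptions; adjust to your data):

import pyodbc

# One-off setup sketch; server, database and credentials are the same placeholders used above.
cnxn = pyodbc.connect(driver='{ODBC Driver 17 for SQL Server}',
                      server='mssql_server_ip', database='db_name',
                      uid='user', pwd='password')
cursor = cnxn.cursor()
# Column names match the INSERT in the script; types and sizes are assumptions.
cursor.execute("""
    CREATE TABLE [ba_analysis].[dbo].[website1] (
        search_query        NVARCHAR(1000),
        landing_page        NVARCHAR(2000),
        device              NVARCHAR(50),
        impressions         NVARCHAR(50),
        clicks              NVARCHAR(50),
        average_position    NVARCHAR(50),
        click_through_rate  NVARCHAR(50),
        data_date           NVARCHAR(10)
    )
""")
cnxn.commit()
cnxn.close()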
At the time of this writing, Google has increased its data delay again: first you would not get any data for the past 4 days, and now the delay is 5 days, including today. So if you want to use this script, set the --days parameter to at least 6 in order to get any data back.
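For example (the script filename below is illustrative; the gist does not name the file):

python3 gsc_to_mssql.py https://website1.com --days 6

This requests today plus the 5 preceding days; given the delay described above, only the oldest day in that window can actually return any rows.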