Last active
January 11, 2022 10:20
-
-
Save andymithamclarke/eb0e80a598354449c72bff7b607e4863 to your computer and use it in GitHub Desktop.
Notebook for scraping reviews from Trustpilot
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Trustpilot Scraping Script\n", | |
"Notebook created Jan, 2022 by <a href=\"https://www.linkedin.com/in/andy-clarke-media/\" target=\"_blank\">Andy Clarke - Data Journalist @ Graphext</a>\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 36, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"## Imports\n", | |
"import math\n", | |
"import csv\n", | |
"import time\n", | |
"import json\n", | |
"import requests\n", | |
"from lxml import html" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 37, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Scraper set for http://www.trustpilot.com/review/n26.com - saving result to test_n26.csv\n" | |
] | |
} | |
], | |
"source": [ | |
"## Configurations\n", | |
"\n", | |
"# Trustpilot review page\n", | |
"basePage = 'http://www.trustpilot.com/review/'\n", | |
"\n", | |
"# Change the name of your \n", | |
"reviewSite = 'n26.com'\n", | |
"reviewPage = basePage + reviewSite\n", | |
"\n", | |
"# Data file to save to | will save data row by row\n", | |
"datafile = 'test_n26.csv'\n", | |
"\n", | |
"# Trustpilot default \n", | |
"resultsPerPage = 20 \n", | |
"\n", | |
"print('Scraper set for ' + reviewPage + ' - saving result to ' + datafile)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 38, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py:847: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings\n", | |
" InsecureRequestWarning)\n" | |
] | |
} | |
], | |
"source": [ | |
"## Count amount of pages to scrape\n", | |
"\n", | |
"# Get page, skipping HTTPS as it gives certificate errors\n", | |
"page = requests.get(reviewPage, verify=False)\n", | |
"tree = html.fromstring(page.content)\n", | |
"\n", | |
"# Amount of chunks to consider for displaying processing output \n", | |
"# For ex. 10 means output progress for every 10th of the data\n", | |
"tot_chunks = 20\n", | |
"\n", | |
"# Throttling to avoid spamming page with requests\n", | |
"# With sleepTime seconds between every page request\n", | |
"throttle = True\n", | |
"sleepTime = 1\n", | |
"\n", | |
"# Adjust number of pages to scrape (use the number inside the last pagination element)\n", | |
"pages = 0" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 39, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Processing..\n" | |
] | |
}, | |
{ | |
"ename": "KeyboardInterrupt", | |
"evalue": "", | |
"output_type": "error", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m<ipython-input-39-a024e3f3794e>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;31m# Sleep if throttle enabled\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m \u001b[0;32mif\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mthrottle\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msleepTime\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0mpage\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrequests\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreviewPage\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'?page='\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;31mKeyboardInterrupt\u001b[0m: " | |
] | |
} | |
], | |
"source": [ | |
"## Main scraping section\n", | |
"\n", | |
"with open(datafile, 'w', newline='', encoding='utf8') as csvfile:\n", | |
" \n", | |
" # Tab delimited to allow for special characters\n", | |
" datawriter = csv.writer(csvfile, delimiter=',')\n", | |
" datawriter.writerow(['published_date', 'country_code', 'title', 'body', 'rating'])\n", | |
" print('Processing..')\n", | |
" for i in range(1,pages+1):\n", | |
" \n", | |
" # Sleep if throttle enabled\n", | |
" if(throttle): time.sleep(sleepTime)\n", | |
"\n", | |
" page = requests.get(reviewPage + '?page=' + str(i))\n", | |
" tree = html.fromstring(page.content)\n", | |
" \n", | |
" # Each item below scrapes a pages review titles, bodies and ratings\n", | |
" \n", | |
" # This element selector will update\n", | |
" script_bodies = tree.xpath(\"//script[starts-with(@id, '__NEXT_DATA__')]\")\n", | |
" for idx,elem in enumerate(script_bodies):\n", | |
" curr_item = json.loads(elem.text_content())\n", | |
" \n", | |
" reviews_list = curr_item['props']['pageProps']['reviews']\n", | |
" \n", | |
" # Capture Data\n", | |
" for rev in reviews_list:\n", | |
" country_code = rev['consumer']['countryCode']\n", | |
" published_date = rev['dates']['publishedDate']\n", | |
" title = rev['title']\n", | |
" body = rev['text']\n", | |
" rating = rev['rating']\n", | |
" \n", | |
" #print(published_date, country_code, title, body, rating)\n", | |
" \n", | |
" datawriter.writerow([published_date, country_code, title, body, rating])\n", | |
" \n", | |
" print('Processed ratings.. Finished!')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment