{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Trustpilot Scraping Script\n",
"Notebook created Jan, 2022 by <a href=\"https://www.linkedin.com/in/andy-clarke-media/\" target=\"_blank\">Andy Clarke - Data Journalist @ Graphext</a>\n"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"## Imports\n",
"import math\n",
"import csv\n",
"import time\n",
"import json\n",
"import requests\n",
"from lxml import html"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Scraper set for http://www.trustpilot.com/review/n26.com - saving result to test_n26.csv\n"
]
}
],
"source": [
"## Configurations\n",
"\n",
"# Trustpilot review page\n",
"basePage = 'http://www.trustpilot.com/review/'\n",
"\n",
"# Change the name of your \n",
"reviewSite = 'n26.com'\n",
"reviewPage = basePage + reviewSite\n",
"\n",
"# Data file to save to | will save data row by row\n",
"datafile = 'test_n26.csv'\n",
"\n",
"# Trustpilot default \n",
"resultsPerPage = 20 \n",
"\n",
"print('Scraper set for ' + reviewPage + ' - saving result to ' + datafile)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py:847: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings\n",
" InsecureRequestWarning)\n"
]
}
],
"source": [
"## Count amount of pages to scrape\n",
"\n",
"# Get page, skipping HTTPS as it gives certificate errors\n",
"page = requests.get(reviewPage, verify=False)\n",
"tree = html.fromstring(page.content)\n",
"\n",
"# Amount of chunks to consider for displaying processing output \n",
"# For ex. 10 means output progress for every 10th of the data\n",
"tot_chunks = 20\n",
"\n",
"# Throttling to avoid spamming page with requests\n",
"# With sleepTime seconds between every page request\n",
"throttle = True\n",
"sleepTime = 1\n",
"\n",
"# Adjust number of pages to scrape (use the number inside the last pagination element)\n",
"pages = 0"
]
},
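{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch for deriving `pages` automatically instead of setting it by hand. It assumes the total review count is exposed at `props.pageProps.businessUnit.numberOfReviews` inside the page's `__NEXT_DATA__` JSON; that path is a guess, so inspect the JSON on the live page and adjust it (or keep setting `pages` manually from the last pagination element)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## Derive `pages` automatically (sketch)\n",
"\n",
"# Assumption: the embedded __NEXT_DATA__ JSON exposes a total review count at\n",
"# props.pageProps.businessUnit.numberOfReviews - this path is a guess, so\n",
"# verify it on the live page before relying on it\n",
"next_data_tags = tree.xpath(\"//script[starts-with(@id, '__NEXT_DATA__')]\")\n",
"if next_data_tags:\n",
"    next_data = json.loads(next_data_tags[0].text_content())\n",
"    total_reviews = next_data['props']['pageProps']['businessUnit']['numberOfReviews']\n",
"    pages = math.ceil(total_reviews / resultsPerPage)\n",
"    print('Found', total_reviews, 'reviews ->', pages, 'pages to scrape')"
]
},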
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processing..\n"
]
}
],
"source": [
"## Main scraping section\n",
"\n",
"with open(datafile, 'w', newline='', encoding='utf8') as csvfile:\n",
" \n",
" # Tab delimited to allow for special characters\n",
" datawriter = csv.writer(csvfile, delimiter=',')\n",
" datawriter.writerow(['published_date', 'country_code', 'title', 'body', 'rating'])\n",
" print('Processing..')\n",
" for i in range(1,pages+1):\n",
" \n",
" # Sleep if throttle enabled\n",
" if(throttle): time.sleep(sleepTime)\n",
"\n",
" page = requests.get(reviewPage + '?page=' + str(i))\n",
" tree = html.fromstring(page.content)\n",
" \n",
" # Each item below scrapes a pages review titles, bodies and ratings\n",
" \n",
" # This element selector will update\n",
" script_bodies = tree.xpath(\"//script[starts-with(@id, '__NEXT_DATA__')]\")\n",
" for idx,elem in enumerate(script_bodies):\n",
" curr_item = json.loads(elem.text_content())\n",
" \n",
" reviews_list = curr_item['props']['pageProps']['reviews']\n",
" \n",
" # Capture Data\n",
" for rev in reviews_list:\n",
" country_code = rev['consumer']['countryCode']\n",
" published_date = rev['dates']['publishedDate']\n",
" title = rev['title']\n",
" body = rev['text']\n",
" rating = rev['rating']\n",
" \n",
" #print(published_date, country_code, title, body, rating)\n",
" \n",
" datawriter.writerow([published_date, country_code, title, body, rating])\n",
" \n",
" print('Processed ratings.. Finished!')"
]
},
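{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional sanity check: read the saved CSV back and print the first few rows. This uses only the `datafile` name configured above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## Sanity check: preview the saved CSV\n",
"\n",
"# Print the header row plus the first five data rows\n",
"with open(datafile, newline='', encoding='utf8') as csvfile:\n",
"    reader = csv.reader(csvfile)\n",
"    for row_num, row in enumerate(reader):\n",
"        print(row)\n",
"        if row_num >= 5:\n",
"            break"
]
}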
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}