Created
September 16, 2018 19:31
-
-
Save mrizwan47/c7ebe55da843501fe62522579577ff68 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Scraping From Amazon and hitting JungleScout API" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 36, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import pandas as pd\n", | |
| "import requests\n", | |
| "import re\n", | |
| "import urllib.parse\n", | |
| "from IPython.core.display import display, HTML" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Import Excel File" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 46, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Your first 5 rows look like this:\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style>\n", | |
| " .dataframe thead tr:only-child th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: left;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>ASIN</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>B0014CHDYO</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>B072PCLQDK</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>B00076HTH4</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>B00KCNUJ24</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>B000BWP6HY</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " ASIN\n", | |
| "0 B0014CHDYO\n", | |
| "1 B072PCLQDK\n", | |
| "2 B00076HTH4\n", | |
| "3 B00KCNUJ24\n", | |
| "4 B000BWP6HY" | |
| ] | |
| }, | |
| "execution_count": 46, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Change this:\n", | |
| "upload_file_name = '19905.xlsx'\n", | |
| "\n", | |
| "\n", | |
| "data = pd.read_excel('data/{}'.format(upload_file_name))\n", | |
| "print( \"Your first 5 rows look like this:\" )\n", | |
| "data.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Request Daily Token" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 47, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Your daily token is: FkAm2B8Tw7yvGnN8j3um8A==\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "token_req = requests.get(url='https://junglescoutpro.herokuapp.com/api/v1/users/[email protected]').json()\n", | |
| "token = token_req['daily_token']\n", | |
| "\n", | |
| "print('Your daily token is: {}'.format(token))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Process" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 48, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "B0014CHDYO: 60\n", | |
| "B072PCLQDK: 368\n", | |
| "B00076HTH4: 50\n", | |
| "B00KCNUJ24: 5\n", | |
| "B000BWP6HY: < 5\n", | |
| "B00DFVNWXS: < 5\n", | |
| "B008JQF9I0: < 5\n", | |
| "B00WI1CXKY: < 5\n", | |
| "B06XWBC5H1: < 5\n", | |
| "B00A3PHR94: < 5\n", | |
| "B00UG0Z892: < 5\n", | |
| "B01AYOKMP4: < 5\n", | |
| "B00DWGFW52: < 5\n", | |
| "B00DHCBR7I: < 5\n", | |
| "B002XV9JLU: < 5\n", | |
| "B002UUV2DW: 13645\n", | |
| "B007XO02IQ: 18\n", | |
| "B00BMCGM2K: < 5\n", | |
| "B00LRN42WG: FAILED, sales rank not found\n", | |
| "B00I9SI40S: 3156\n", | |
| "B01KMC5C2G: < 5\n", | |
| "B078RSPRMJ: FAILED, sales rank not found\n", | |
| "B00CJQ5ASK: 9\n", | |
| "B00PWA83LQ: < 5\n", | |
| "B00YFZ8LKC: < 5\n", | |
| "B000O8MZQK: < 5\n", | |
| "B01KMC5BZE: < 5\n", | |
| "B0006G4Z5Y: < 5\n", | |
| "B00JA55MNG: 8\n", | |
| "B0085EYBPI: < 5\n", | |
| "B000255PBC: 90\n", | |
| "B000EY4YLS: 3649\n", | |
| "B000Q5NDHG: 40\n", | |
| "B01C4K7N8A: < 5\n", | |
| "B01N3Q6IDH: < 5\n", | |
| "B0017JG296: < 5\n", | |
| "B0021L8XBO: 48\n", | |
| "B002SX5H00: < 5\n", | |
| "B004YCJPT2: 301\n", | |
| "B009C06HWO: < 5\n", | |
| "B00AFBKZUO: 76\n", | |
| "B00CJNGKO6: 68\n", | |
| "B00M6SYB6S: < 5\n", | |
| "B00TYZHUKA: 137\n", | |
| "B015690284: < 5\n", | |
| "B017QM2OPS: 59\n", | |
| "B01AT0VSSI: 60\n", | |
| "B01B45AMIO: 40\n", | |
| "B01N0SNRLJ: 30\n", | |
| "B009P4QGLO: < 5\n", | |
| "B00B35JA3E: < 5\n", | |
| "B01D9OC4SO: < 5\n", | |
| "B01N3VVUK7: < 5\n", | |
| "B0713PVDVB: < 5\n", | |
| "B008Y1PJBW: 963\n", | |
| "B00B3RQCGA: 88\n", | |
| "B01IJZFZ5W: < 5\n", | |
| "B0050OQ3AW: < 5\n", | |
| "B0089VTAFS: < 5\n", | |
| "B00DFVOA52: < 5\n", | |
| "B00JSMWE7I: < 5\n", | |
| "B00K0PQII8: 2331\n", | |
| "B01GKE273W: < 5\n", | |
| "B06WW8WHD3: < 5\n", | |
| "B004HIM5AO: < 5\n", | |
| "B007ROC212: < 5\n", | |
| "B00HC8XCD0: < 5\n", | |
| "B00JSMVMXK: < 5\n", | |
| "B00RA1H92Y: 18\n", | |
| "B016ZZ6ER6: < 5\n", | |
| "B0006G53ZA: < 5\n", | |
| "B00EZM8O52: < 5\n", | |
| "B01M4J5B4S: < 5\n", | |
| "B01MXGTJ63: < 5\n", | |
| "B01N06D3VR: < 5\n", | |
| "B0006N9YNK: < 5\n", | |
| "B0035HB75C: < 5\n", | |
| "B004LO7MNO: 81\n", | |
| "B0053AUWYQ: < 5\n", | |
| "B01L1YHG2I: < 5\n", | |
| "B01N139898: < 5\n", | |
| "B06VXY29ZK: < 5\n", | |
| "B06WD8137L: < 5\n", | |
| "B06X3ZY21K: < 5\n", | |
| "B06X94VWLX: N.A.\n", | |
| "B0002AQW64: < 5\n", | |
| "B001ATBBUS: 30\n", | |
| "B0035H5HSU: < 5\n", | |
| "B00C6UXR0W: < 5\n", | |
| "B01M97ROYT: < 5\n", | |
| "B06VXY29NG: < 5\n", | |
| "B000255N0A: 443\n", | |
| "B0002DH3EG: < 5\n", | |
| "B000JKO1T2: 8\n", | |
| "B007CE4FPI: < 5\n", | |
| "B00A27W2YI: < 5\n", | |
| "B00CJQ3PWI: 30\n", | |
| "B00CWHKJFU: < 5\n", | |
| "B00D04REDE: < 5\n", | |
| "B00T85F1J8: < 5\n", | |
| "B01N0WQ9S9: < 5\n", | |
| "B06VY5DXY7: < 5\n", | |
| "B0018CJNDO: < 5\n", | |
| "B009VBTZUK: < 5\n", | |
| "B00CJQ5FYO: < 5\n", | |
| "B016YXFMO0: < 5\n", | |
| "B004HIM51S: < 5\n", | |
| "B004WXPVF0: < 5\n", | |
| "B009VBQD5A: < 5\n", | |
| "B00YJLAN9E: < 5\n", | |
| "B010UNGW94: FAILED, sales rank not found\n", | |
| "B016YXFIRQ: FAILED, sales rank not found\n", | |
| "B01GKE1MC4: < 5\n", | |
| "B01GKTFELE: < 5\n", | |
| "B06VY5F42N: N.A.\n", | |
| "B06WVGV3N2: < 5\n", | |
| "B0002AQ0N4: < 5\n", | |
| "B000R4KWNE: 60\n", | |
| "B0011Z67KU: < 5\n", | |
| "B003DASH9U: 502\n", | |
| "B005GW8DJC: < 5\n", | |
| "B005ZSHUFK: < 5\n", | |
| "B00BCYGJHG: < 5\n", | |
| "B00BCYGNRC: < 5\n", | |
| "B00CJQ3LI6: < 5\n", | |
| "B00JSMX5XA: 10\n", | |
| "B018K7ICJ0: < 5\n", | |
| "B01IJZFYYY: < 5\n", | |
| "B01M62HRLH: < 5\n", | |
| "B0002DI65G: < 5\n", | |
| "B0002DI8CW: < 5\n", | |
| "B0032GEEJ2: < 5\n", | |
| "B00CJQ3SWU: 22\n", | |
| "B00JSMWCF2: < 5\n", | |
| "B00YJL29PU: < 5\n", | |
| "B01EMUX2X0: 125\n", | |
| "B01M21E7MY: < 5\n", | |
| "B072R1VD16: < 5\n", | |
| "B0002AQIA4: < 5\n", | |
| "B0002AT14O: 28\n", | |
| "B0002DJZX8: 1246\n", | |
| "B003OCXQDE: 423\n", | |
| "B00I4YWY9O: < 5\n", | |
| "B0035HB8YC: < 5\n", | |
| "B00CJQ3P3C: < 5\n", | |
| "B00494JWY8: 2348\n", | |
| "Ready to save!\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "cols = ['ASIN', 'Estimated Sales']\n", | |
| "lst = []\n", | |
| "for asin in data['ASIN']:\n", | |
| " if not \"Not found on Amazon\" == asin:\n", | |
| " product_url = 'https://www.amazon.com/dp/{}'.format(asin)\n", | |
| " page_content = requests.get(url=product_url).content\n", | |
| " m = re.search('SalesRank.*?\\#([0-9,]+)\\s+[a-z]+\\s+(.*?)\\s+\\(', str(page_content))\n", | |
| " if m:\n", | |
| " salesrank = m.groups()[0].replace(',', '')\n", | |
| " category = urllib.parse.quote(m.groups()[1])\n", | |
| " estSalesReq = requests.get(\"https://junglescoutpro.herokuapp.com/api/v1/est_sales?store=us&asin={}&rank={}&category={}&dailyToken={}\".format(asin, salesrank, category, token), headers={'referer': product_url}).json()\n", | |
| " lst.append([asin, estSalesReq['estSalesResult']])\n", | |
| " print( '{}: {}'.format(asin, estSalesReq['estSalesResult']))\n", | |
| " else:\n", | |
| " \n", | |
| " rank_cat_req = requests.get(\"https://junglescoutpro.herokuapp.com/api/v1/est_sales?store=us&asin={}&dailyToken={}\".format(asin, token), headers={'referer': product_url}).json() \n", | |
| " if not 'rank' in rank_cat_req.keys():\n", | |
| " lst.append([asin, 'FAILED, sales rank not found'])\n", | |
| " print( '{}: FAILED, sales rank not found'.format(asin))\n", | |
| " else:\n", | |
| " salesrank = rank_cat_req['rank']\n", | |
| " category = urllib.parse.quote(rank_cat_req['category']) \n", | |
| " estSalesReq = requests.get(\"https://junglescoutpro.herokuapp.com/api/v1/est_sales?store=us&asin={}&rank={}&category={}&dailyToken={}\".format(asin, salesrank, category, token), headers={'referer': product_url}).json()\n", | |
| " \n", | |
| " lst.append([asin, estSalesReq['estSalesResult']])\n", | |
| " print( '{}: {}'.format(asin, estSalesReq['estSalesResult']))\n", | |
| "print( 'Ready to save!' )\n", | |
| "new_df = pd.DataFrame(lst, columns=['ASIN', 'Estimated Sales'])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Save CSV" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 49, | |
| "metadata": { | |
| "scrolled": true | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<a target=\"_blank\" href=\"data/Saved CSVs/test.csv\">Download File</a>" | |
| ], | |
| "text/plain": [ | |
| "<IPython.core.display.HTML object>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "# Change this (existing file will be replaced)\n", | |
| "save_file_name = 'test.csv'\n", | |
| "\n", | |
| "\n", | |
| "new_df.to_csv('data/Saved CSVs/{}'.format(save_file_name), index=False)\n", | |
| "display(HTML('<a target=\"_blank\" href=\"data/Saved CSVs/{}\">Download File</a>'.format(save_file_name)))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "source": [ | |
| "# Single ASIN Check" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 56, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "B0002DJZX8: 1243\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Change this:\n", | |
| "asin_number = \"B0002DJZX8\"\n", | |
| "\n", | |
| "\n", | |
| "if asin_number:\n", | |
| " product_url = 'https://www.amazon.com/dp/{}'.format(asin_number)\n", | |
| " page_content = requests.get(url=product_url).content\n", | |
| " m = re.search('SalesRank.*?\\#([0-9,]+)\\s+[a-z]+\\s+(.*?)\\s+\\(', str(page_content))\n", | |
| " if m:\n", | |
| " salesrank = m.groups()[0].replace(',', '')\n", | |
| " category = urllib.parse.quote(m.groups()[1])\n", | |
| " estSalesReq = requests.get(\"https://junglescoutpro.herokuapp.com/api/v1/est_sales?store=us&asin={}&rank={}&category={}&dailyToken={}\".format(asin_number, salesrank, category, token), headers={'referer': product_url}).json()\n", | |
| " print( '{}: {}'.format(asin_number, estSalesReq['estSalesResult']))\n", | |
| " else:\n", | |
| " \n", | |
| " rank_cat_req = requests.get(\"https://junglescoutpro.herokuapp.com/api/v1/est_sales?store=us&asin={}&dailyToken={}\".format(asin_number, token), headers={'referer': product_url}).json() \n", | |
| " if not 'rank' in rank_cat_req.keys():\n", | |
| " print( '{}: FAILED, sales rank not found'.format(asin_number))\n", | |
| " else:\n", | |
| " salesrank = rank_cat_req['rank']\n", | |
| " category = urllib.parse.quote(rank_cat_req['category']) \n", | |
| " estSalesReq = requests.get(\"https://junglescoutpro.herokuapp.com/api/v1/est_sales?store=us&asin={}&rank={}&category={}&dailyToken={}\".format(asin_number, salesrank, category, token), headers={'referer': product_url}).json()\n", | |
| " \n", | |
| " print( '{}: {}'.format(asin_number, estSalesReq['estSalesResult']))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.5.2" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment