Created
September 16, 2018 19:31
-
-
Save mrizwan47/c7ebe55da843501fe62522579577ff68 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Scraping From Amazon and hitting JungleScout API" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 36, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import requests\n", | |
"import re\n", | |
"import urllib.parse\n", | |
"from IPython.core.display import display, HTML" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Import Excel File" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 46, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Your first 5 rows look like this:\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style>\n", | |
" .dataframe thead tr:only-child th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: left;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>ASIN</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>B0014CHDYO</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>B072PCLQDK</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>B00076HTH4</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>B00KCNUJ24</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>B000BWP6HY</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" ASIN\n", | |
"0 B0014CHDYO\n", | |
"1 B072PCLQDK\n", | |
"2 B00076HTH4\n", | |
"3 B00KCNUJ24\n", | |
"4 B000BWP6HY" | |
] | |
}, | |
"execution_count": 46, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Change this:\n", | |
"upload_file_name = '19905.xlsx'\n", | |
"\n", | |
"\n", | |
"data = pd.read_excel('data/{}'.format(upload_file_name))\n", | |
"print( \"Your first 5 rows look like this:\" )\n", | |
"data.head()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Request Daily Token" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 47, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Your daily token is: FkAm2B8Tw7yvGnN8j3um8A==\n" | |
] | |
} | |
], | |
"source": [ | |
"token_req = requests.get(url='https://junglescoutpro.herokuapp.com/api/v1/users/[email protected]').json()\n", | |
"token = token_req['daily_token']\n", | |
"\n", | |
"print('Your daily token is: {}'.format(token))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Process" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 48, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"B0014CHDYO: 60\n", | |
"B072PCLQDK: 368\n", | |
"B00076HTH4: 50\n", | |
"B00KCNUJ24: 5\n", | |
"B000BWP6HY: < 5\n", | |
"B00DFVNWXS: < 5\n", | |
"B008JQF9I0: < 5\n", | |
"B00WI1CXKY: < 5\n", | |
"B06XWBC5H1: < 5\n", | |
"B00A3PHR94: < 5\n", | |
"B00UG0Z892: < 5\n", | |
"B01AYOKMP4: < 5\n", | |
"B00DWGFW52: < 5\n", | |
"B00DHCBR7I: < 5\n", | |
"B002XV9JLU: < 5\n", | |
"B002UUV2DW: 13645\n", | |
"B007XO02IQ: 18\n", | |
"B00BMCGM2K: < 5\n", | |
"B00LRN42WG: FAILED, sales rank not found\n", | |
"B00I9SI40S: 3156\n", | |
"B01KMC5C2G: < 5\n", | |
"B078RSPRMJ: FAILED, sales rank not found\n", | |
"B00CJQ5ASK: 9\n", | |
"B00PWA83LQ: < 5\n", | |
"B00YFZ8LKC: < 5\n", | |
"B000O8MZQK: < 5\n", | |
"B01KMC5BZE: < 5\n", | |
"B0006G4Z5Y: < 5\n", | |
"B00JA55MNG: 8\n", | |
"B0085EYBPI: < 5\n", | |
"B000255PBC: 90\n", | |
"B000EY4YLS: 3649\n", | |
"B000Q5NDHG: 40\n", | |
"B01C4K7N8A: < 5\n", | |
"B01N3Q6IDH: < 5\n", | |
"B0017JG296: < 5\n", | |
"B0021L8XBO: 48\n", | |
"B002SX5H00: < 5\n", | |
"B004YCJPT2: 301\n", | |
"B009C06HWO: < 5\n", | |
"B00AFBKZUO: 76\n", | |
"B00CJNGKO6: 68\n", | |
"B00M6SYB6S: < 5\n", | |
"B00TYZHUKA: 137\n", | |
"B015690284: < 5\n", | |
"B017QM2OPS: 59\n", | |
"B01AT0VSSI: 60\n", | |
"B01B45AMIO: 40\n", | |
"B01N0SNRLJ: 30\n", | |
"B009P4QGLO: < 5\n", | |
"B00B35JA3E: < 5\n", | |
"B01D9OC4SO: < 5\n", | |
"B01N3VVUK7: < 5\n", | |
"B0713PVDVB: < 5\n", | |
"B008Y1PJBW: 963\n", | |
"B00B3RQCGA: 88\n", | |
"B01IJZFZ5W: < 5\n", | |
"B0050OQ3AW: < 5\n", | |
"B0089VTAFS: < 5\n", | |
"B00DFVOA52: < 5\n", | |
"B00JSMWE7I: < 5\n", | |
"B00K0PQII8: 2331\n", | |
"B01GKE273W: < 5\n", | |
"B06WW8WHD3: < 5\n", | |
"B004HIM5AO: < 5\n", | |
"B007ROC212: < 5\n", | |
"B00HC8XCD0: < 5\n", | |
"B00JSMVMXK: < 5\n", | |
"B00RA1H92Y: 18\n", | |
"B016ZZ6ER6: < 5\n", | |
"B0006G53ZA: < 5\n", | |
"B00EZM8O52: < 5\n", | |
"B01M4J5B4S: < 5\n", | |
"B01MXGTJ63: < 5\n", | |
"B01N06D3VR: < 5\n", | |
"B0006N9YNK: < 5\n", | |
"B0035HB75C: < 5\n", | |
"B004LO7MNO: 81\n", | |
"B0053AUWYQ: < 5\n", | |
"B01L1YHG2I: < 5\n", | |
"B01N139898: < 5\n", | |
"B06VXY29ZK: < 5\n", | |
"B06WD8137L: < 5\n", | |
"B06X3ZY21K: < 5\n", | |
"B06X94VWLX: N.A.\n", | |
"B0002AQW64: < 5\n", | |
"B001ATBBUS: 30\n", | |
"B0035H5HSU: < 5\n", | |
"B00C6UXR0W: < 5\n", | |
"B01M97ROYT: < 5\n", | |
"B06VXY29NG: < 5\n", | |
"B000255N0A: 443\n", | |
"B0002DH3EG: < 5\n", | |
"B000JKO1T2: 8\n", | |
"B007CE4FPI: < 5\n", | |
"B00A27W2YI: < 5\n", | |
"B00CJQ3PWI: 30\n", | |
"B00CWHKJFU: < 5\n", | |
"B00D04REDE: < 5\n", | |
"B00T85F1J8: < 5\n", | |
"B01N0WQ9S9: < 5\n", | |
"B06VY5DXY7: < 5\n", | |
"B0018CJNDO: < 5\n", | |
"B009VBTZUK: < 5\n", | |
"B00CJQ5FYO: < 5\n", | |
"B016YXFMO0: < 5\n", | |
"B004HIM51S: < 5\n", | |
"B004WXPVF0: < 5\n", | |
"B009VBQD5A: < 5\n", | |
"B00YJLAN9E: < 5\n", | |
"B010UNGW94: FAILED, sales rank not found\n", | |
"B016YXFIRQ: FAILED, sales rank not found\n", | |
"B01GKE1MC4: < 5\n", | |
"B01GKTFELE: < 5\n", | |
"B06VY5F42N: N.A.\n", | |
"B06WVGV3N2: < 5\n", | |
"B0002AQ0N4: < 5\n", | |
"B000R4KWNE: 60\n", | |
"B0011Z67KU: < 5\n", | |
"B003DASH9U: 502\n", | |
"B005GW8DJC: < 5\n", | |
"B005ZSHUFK: < 5\n", | |
"B00BCYGJHG: < 5\n", | |
"B00BCYGNRC: < 5\n", | |
"B00CJQ3LI6: < 5\n", | |
"B00JSMX5XA: 10\n", | |
"B018K7ICJ0: < 5\n", | |
"B01IJZFYYY: < 5\n", | |
"B01M62HRLH: < 5\n", | |
"B0002DI65G: < 5\n", | |
"B0002DI8CW: < 5\n", | |
"B0032GEEJ2: < 5\n", | |
"B00CJQ3SWU: 22\n", | |
"B00JSMWCF2: < 5\n", | |
"B00YJL29PU: < 5\n", | |
"B01EMUX2X0: 125\n", | |
"B01M21E7MY: < 5\n", | |
"B072R1VD16: < 5\n", | |
"B0002AQIA4: < 5\n", | |
"B0002AT14O: 28\n", | |
"B0002DJZX8: 1246\n", | |
"B003OCXQDE: 423\n", | |
"B00I4YWY9O: < 5\n", | |
"B0035HB8YC: < 5\n", | |
"B00CJQ3P3C: < 5\n", | |
"B00494JWY8: 2348\n", | |
"Ready to save!\n" | |
] | |
} | |
], | |
"source": [ | |
"cols = ['ASIN', 'Estimated Sales']\n", | |
"lst = []\n", | |
"for asin in data['ASIN']:\n", | |
" if not \"Not found on Amazon\" == asin:\n", | |
" product_url = 'https://www.amazon.com/dp/{}'.format(asin)\n", | |
" page_content = requests.get(url=product_url).content\n", | |
" m = re.search('SalesRank.*?\\#([0-9,]+)\\s+[a-z]+\\s+(.*?)\\s+\\(', str(page_content))\n", | |
" if m:\n", | |
" salesrank = m.groups()[0].replace(',', '')\n", | |
" category = urllib.parse.quote(m.groups()[1])\n", | |
" estSalesReq = requests.get(\"https://junglescoutpro.herokuapp.com/api/v1/est_sales?store=us&asin={}&rank={}&category={}&dailyToken={}\".format(asin, salesrank, category, token), headers={'referer': product_url}).json()\n", | |
" lst.append([asin, estSalesReq['estSalesResult']])\n", | |
" print( '{}: {}'.format(asin, estSalesReq['estSalesResult']))\n", | |
" else:\n", | |
" \n", | |
" rank_cat_req = requests.get(\"https://junglescoutpro.herokuapp.com/api/v1/est_sales?store=us&asin={}&dailyToken={}\".format(asin, token), headers={'referer': product_url}).json() \n", | |
" if not 'rank' in rank_cat_req.keys():\n", | |
" lst.append([asin, 'FAILED, sales rank not found'])\n", | |
" print( '{}: FAILED, sales rank not found'.format(asin))\n", | |
" else:\n", | |
" salesrank = rank_cat_req['rank']\n", | |
" category = urllib.parse.quote(rank_cat_req['category']) \n", | |
" estSalesReq = requests.get(\"https://junglescoutpro.herokuapp.com/api/v1/est_sales?store=us&asin={}&rank={}&category={}&dailyToken={}\".format(asin, salesrank, category, token), headers={'referer': product_url}).json()\n", | |
" \n", | |
" lst.append([asin, estSalesReq['estSalesResult']])\n", | |
" print( '{}: {}'.format(asin, estSalesReq['estSalesResult']))\n", | |
"print( 'Ready to save!' )\n", | |
"new_df = pd.DataFrame(lst, columns=['ASIN', 'Estimated Sales'])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Save CSV" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 49, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<a target=\"_blank\" href=\"data/Saved CSVs/test.csv\">Download File</a>" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# Change this (existing file will be replaced)\n", | |
"save_file_name = 'test.csv'\n", | |
"\n", | |
"\n", | |
"new_df.to_csv('data/Saved CSVs/{}'.format(save_file_name), index=False)\n", | |
"display(HTML('<a target=\"_blank\" href=\"data/Saved CSVs/{}\">Download File</a>'.format(save_file_name)))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"collapsed": true | |
}, | |
"source": [ | |
"# Single ASIN Check" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 56, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"B0002DJZX8: 1243\n" | |
] | |
} | |
], | |
"source": [ | |
"# Change this:\n", | |
"asin_number = \"B0002DJZX8\"\n", | |
"\n", | |
"\n", | |
"if asin_number:\n", | |
" product_url = 'https://www.amazon.com/dp/{}'.format(asin_number)\n", | |
" page_content = requests.get(url=product_url).content\n", | |
" m = re.search('SalesRank.*?\\#([0-9,]+)\\s+[a-z]+\\s+(.*?)\\s+\\(', str(page_content))\n", | |
" if m:\n", | |
" salesrank = m.groups()[0].replace(',', '')\n", | |
" category = urllib.parse.quote(m.groups()[1])\n", | |
" estSalesReq = requests.get(\"https://junglescoutpro.herokuapp.com/api/v1/est_sales?store=us&asin={}&rank={}&category={}&dailyToken={}\".format(asin_number, salesrank, category, token), headers={'referer': product_url}).json()\n", | |
" print( '{}: {}'.format(asin_number, estSalesReq['estSalesResult']))\n", | |
" else:\n", | |
" \n", | |
" rank_cat_req = requests.get(\"https://junglescoutpro.herokuapp.com/api/v1/est_sales?store=us&asin={}&dailyToken={}\".format(asin_number, token), headers={'referer': product_url}).json() \n", | |
" if not 'rank' in rank_cat_req.keys():\n", | |
" print( '{}: FAILED, sales rank not found'.format(asin_number))\n", | |
" else:\n", | |
" salesrank = rank_cat_req['rank']\n", | |
" category = urllib.parse.quote(rank_cat_req['category']) \n", | |
" estSalesReq = requests.get(\"https://junglescoutpro.herokuapp.com/api/v1/est_sales?store=us&asin={}&rank={}&category={}&dailyToken={}\".format(asin_number, salesrank, category, token), headers={'referer': product_url}).json()\n", | |
" \n", | |
" print( '{}: {}'.format(asin_number, estSalesReq['estSalesResult']))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment