Example PCHome crawler script for AWS Lambda
import json
import time

import pandas as pd
import requests
import pymysql  # MySQL driver used by the mysql+pymysql SQLAlchemy URL below
from sqlalchemy import create_engine
def pchome_crawler(keyword):
    """Query the PCHome search API page by page and collect product records."""
    query_res_arr = []
    for page_number in range(1, 101):  # PCHome search pages are 1-indexed
        URL = ('https://ecshweb.pchome.com.tw/search/v3.3/all/results?q='
               + keyword + '&page=' + str(page_number))
        res = requests.get(URL, timeout=10)
        try:
            query_res = res.json()
            query_res_arr.append(query_res['prods'])
            time.sleep(5)  # throttle so we don't hammer the API
        except (ValueError, KeyError):
            # Non-JSON response or a page without 'prods': note it and move on
            print(page_number)
    # Flatten the per-page product lists into a single list
    res_arr = []
    for page in query_res_arr:
        res_arr = res_arr + page
    return res_arr
def db_insert(res_arr):
    """Append the crawled products to the pchome_result table."""
    df = pd.DataFrame(res_arr)
    # Drop the coupon column if present; some results omit the field
    df = df.drop(columns=['couponActid'], errors='ignore')
    # Placeholder credentials and host: substitute your own MySQL endpoint
    engine = create_engine(
        'mysql+pymysql://admin:password@your-db-host:3306/test?charset=utf8mb4',
        echo=True,
    )
    with engine.begin() as connection:
        df.to_sql('pchome_result', con=connection, if_exists='append')
def lambda_handler(event, context):
    # API Gateway proxy events carry the keyword as a path parameter or a
    # query-string parameter (the key is 'queryStringParameters')
    keyword = ((event.get('pathParameters') or {}).get('keyword')
               or (event.get('queryStringParameters') or {}).get('keyword')
               or '')
    res_arr = []
    try:
        res_arr = pchome_crawler(keyword)
        db_insert(res_arr)
    except Exception as e:
        print(e)  # surface the failure in CloudWatch logs instead of hiding it
    return {
        'statusCode': 200,
        'body': json.dumps('PCHome crawler: ' + keyword + ' ('
                           + str(len(res_arr)) + ') success.')
    }
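
For a quick smoke test outside AWS, the handler can be invoked directly with a hand-built event. The snippet below is a minimal sketch, assuming the API Gateway Lambda proxy event shape; the keyword value is only an example.

if __name__ == '__main__':
    # Hypothetical test event mimicking an API Gateway proxy request
    sample_event = {
        'pathParameters': {'keyword': 'iphone'},
        'queryStringParameters': None,
    }
    print(lambda_handler(sample_event, None))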