@suensummit
Created January 10, 2021 00:58
Example PCHome crawler script for AWS Lambda
import json
import time

import pandas as pd
import requests
from sqlalchemy import create_engine
import pymysql  # MySQL driver behind the mysql+pymysql SQLAlchemy URL
def pchome_crawler(keyword):
    """Query the PCHome search API page by page and collect product entries."""
    query_res_arr = []
    for page_number in range(100):
        URL = 'https://ecshweb.pchome.com.tw/search/v3.3/all/results?q=' + keyword + '&page=' + str(page_number)
        res = requests.get(URL)
        try:
            query_res = res.json()
            query_res_arr.append(query_res['prods'])
            time.sleep(5)  # throttle requests between pages
        except Exception:
            # Non-JSON response or missing 'prods' key: log the page number and move on.
            print(page_number)
    # Flatten the per-page lists into a single list of product dicts.
    res_arr = []
    for page in query_res_arr:
        res_arr = res_arr + page
    return res_arr
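
# A quick standalone check of the crawler, outside Lambda (the keyword is only
# an example):
#
#     products = pchome_crawler('switch')
#     print(len(products))   # total number of product entries collected
#     print(products[:1])    # first product dict returned by the API, if any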
def db_insert(res_arr):
    """Append the crawled products to the pchome_result table in MySQL."""
    df = pd.DataFrame(res_arr)
    del df['couponActid']  # drop a column that is not stored in the table
    # Connection string as in the gist; the host/credentials appear redacted,
    # so substitute your own MySQL endpoint.
    engine = create_engine("mysql+pymysql://admin:[email protected]:3306/test", encoding='utf-8', echo=True)
    with engine.begin() as connection:
        df.to_sql('pchome_result', con=connection, if_exists='append')
def lambda_handler(event, context):
    # Read the keyword from the path parameters, falling back to the query string.
    keyword = (event.get('pathParameters') or {}).get('keyword') or (event.get('queryString') or {}).get('keyword')
    res_arr = []
    try:
        res_arr = pchome_crawler(keyword)
        db_insert(res_arr)
    except Exception:
        pass
    return {
        'statusCode': 200,
        'body': json.dumps('PCHome crawler: ' + keyword + '(' + str(len(res_arr)) + ') success.')
    }
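
To smoke-test the handler locally before deploying, you can call it with a minimal mock event that uses the same keys the handler reads ('pathParameters' and 'queryString'); the keyword below is only an example, and context can be None because the handler never uses it.

if __name__ == '__main__':
    # Minimal local test: mimic an API Gateway-style event with a path parameter.
    test_event = {
        'pathParameters': {'keyword': 'iphone'},
        'queryString': None,
    }
    print(lambda_handler(test_event, None))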