Skip to content

Instantly share code, notes, and snippets.

@ijharulislam
Created September 12, 2017 09:02
Show Gist options
  • Select an option

  • Save ijharulislam/f21dcdc808729562c101c65b5eeb4c5e to your computer and use it in GitHub Desktop.

Select an option

Save ijharulislam/f21dcdc808729562c101c65b5eeb4c5e to your computer and use it in GitHub Desktop.
import urllib.parse
import urllib.request
import json
# Crawl job description; later serialised to JSON and sent to the crawl
# service as the `input_data` query parameter.
#   'format' - list of per-field rules, each with an XPath and regex /
#              tag-removal settings (presumably one rule per output field).
#   'target' - list of pages to crawl, each with an id and a URL.
# NOTE(review): the string 'Null' (not None) appears wherever no regex is
# set; presumably a sentinel the spider understands - confirm.
input_data = {
    'format': [
        {
            'item_name': 'name',
            'xpath': '//*[@id="storeMap"]/header[3]/div/h2/text()',
            'regex_match': 'Null',
            'regex_item': 'Null',
            'remove_tag': '',
        },
        {
            'item_name': 'zip',
            'xpath': '//*[@id="storeMap"]/section/div/div[1]/dl/dd[1]',
            'regex_match': '〒([\\d\\-]+)',
            'regex_item': '$1 ',
            'remove_tag': '',
        },
        # Disabled 'address' rule, kept for reference:
        # {'item_name': 'address', 'xpath': '//*[@id="storeMap"]/section/div/div[1]/dl/dd[1]', 'regex_match': '〒[\\d\\-]+<br>([^<]+)', 'regex_item': '$1 ', 'remove_tag': ''},
        {
            'item_name': 'tel',
            'xpath': '//*[@id="storeMap"]/section/div/div[1]/dl/dd[2]/text()',
            'regex_match': 'Null',
            'regex_item': 'Null',
            'remove_tag': '',
        },
        {
            'item_name': 'hour',
            'xpath': '//*[@id="storeMap"]/section/div/div[1]/dl/dd[3]',
            'regex_match': 'Null',
            'regex_item': 'Null',
            'remove_tag': '1',
        },
        # NOTE(review): this rule repeats the first 'name' rule verbatim;
        # confirm whether the duplicate is intentional.
        {
            'item_name': 'name',
            'xpath': '//*[@id="storeMap"]/header[3]/div/h2/text()',
            'regex_match': 'Null',
            'regex_item': 'Null',
            'remove_tag': '',
        },
    ],
    'target': [
        {
            'id': '1',
            'url': 'https://www.burgerkingjapan.co.jp/stores/detail.html?sn=89',
        },
        {
            'id': '2',
            'url': 'https://www.burgerkingjapan.co.jp/stores/detail.html?sn=124',
        },
    ],
}
# Address of the crawl endpoint; the spider name, a start URL and the
# start_requests flag are already part of its query string.
apiurl = "http://138.68.241.86:9080/crawl.json?spider_name=shop_info&url=http://www.dmoz.org/Computers/Programming/Languages/Ada&start_requests=true"

# Serialise the job description to JSON (this rebinds input_data from a dict
# to a str), then percent-encode it as a single `input_data` parameter.
input_data = json.dumps(input_data)
input_data_values = urllib.parse.urlencode({"input_data": input_data})

# apiurl already ends in a query string, so the extra parameter is appended
# with '&' rather than '?'.
full_url = '&'.join((apiurl, input_data_values))
print(full_url)
# Send the GET request and read the whole response body. The `with` block
# closes the connection once the body has been read (or if reading fails).
with urllib.request.urlopen(full_url) as response:
    # read() returns bytes; they are printed as-is, without decoding.
    html = response.read()
print(html)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment