用爬蟲的方式,透過 Google Flights 列出所有機票。
使用 Selenium 爬取,但 Google 的 class 名稱都被打亂過,所以 trace 了很久。
而且中途會遇到 class 改變的狀況;觀察了一陣子之後,發現要先 click 搜尋框才能抓到我要的 element,再輸入資料。
終於前進到想要的頁面之後,就把 HTML 丟進 BeautifulSoup 解析,之後再用 AJAX 丟到前端就搞定了!
:D
<!DOCTYPE html>
<!-- Flight search page: two text inputs and a button; results are fetched
     from the relative "ajax" endpoint and rendered into #output. -->
<html lang="zh-Hant">
<head>
    <meta charset="UTF-8">
    <title>找機票</title>
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-beta.2/css/bootstrap.min.css" integrity="sha384-PsH8R72JQ3SOdhVi3uxftmaW6Vc51MKb0q5P2rRUpPvrszuE4W1povHYgTpBfshb" crossorigin="anonymous">
    <!-- jQuery: loaded over HTTPS so it is not blocked as mixed content
         when the page itself is served over HTTPS. -->
    <script src="https://code.jquery.com/jquery-1.11.1.min.js"></script>
    <style>
        .search{
            padding: 30px;
            text-align: center;
        }
        .box{
            margin-top: 30px;
        }
        h2{
            font-size: 24px;
            color: red;
            padding: 0 10px 0 10px;
        }
        h2.title{
            color:black;
            margin-top:20px;
        }
        h3{
            font-size: 18px;
            padding: 10px;
        }
        h4{
            font-size: 16px;
            padding: 10px;
            color: green;
        }
    </style>
</head>
<body>
    <div class="row">
        <div class="col-3"></div>
        <div class="col-6">
            <div class="search">
                從 <input type="text" id="departure">
                到 <input type="text" id="destination">
                <br>
                <button id="submit" type="button" class="btn btn-info" style="float: right;">搜尋</button>
            </div>
            <div id="output"></div>
        </div>
        <div class="col-3"></div>
    </div>
    <!-- The script lives inside <body>: markup after </body> is invalid HTML. -->
    <script>
        $('#submit').click(function () {
            // Read both inputs once, at click time, so the request and the
            // summary line always describe the same search.
            var dep = $('#departure').val();
            var des = $('#destination').val();
            var $output = $('#output');

            $.ajax({
                type: 'GET',
                url: 'ajax',
                data: {
                    des: des,
                    dep: dep
                },
                success: function (result) {
                    var tickets = result['tickets'] || [];

                    if (tickets.length === 0) {
                        $output.html("<h2 class='title'>找到 0 筆資料,換個關鍵字吧!</h2>");
                        return;
                    }

                    // Build nodes with .text() instead of concatenating HTML:
                    // ticket fields are scraped from a third-party page and
                    // dep/des are typed by the user, so none of them may be
                    // interpreted as markup.
                    var $title = $('<h2>').addClass('title').text(
                        '找到 ' + tickets.length + ' 筆從 ' + dep + ' 到 ' + des + ' 的機票'
                    );
                    $output.empty().append($title);

                    $.each(tickets, function (_, ticket) {
                        // ticket['flight_type'] is also returned by the
                        // server but is intentionally not displayed.
                        var $box = $('<div>').addClass('alert alert-secondary box');
                        $box.append($('<h2>').text(ticket['price']));
                        $box.append($('<h3>').text(ticket['company']));
                        $box.append($('<h4>').text(ticket['time'] + ' (' + ticket['duration'] + ')'));
                        $box.append($('<span>').addClass('badge badge-success').text(ticket['ticket_type']));
                        $output.append($box);
                    });
                },
                error: function () {
                    // Without this handler a failed request leaves the page
                    // silently unchanged.
                    $output.html("<h2 class='title'>搜尋失敗,請稍後再試!</h2>");
                }
            });
        });
    </script>
</body>
</html>
# Main parse code: scrape Google Flights with Selenium and BeautifulSoup
from selenium import webdriver | |
from selenium.webdriver.common.keys import Keys | |
from time import sleep | |
from bs4 import BeautifulSoup | |
def from_to(browser, departure, destination):
    """Load the Google Flights search page, submit a route, return the HTML.

    Args:
        browser: Selenium WebDriver used to drive the page.
        departure: Departure keyword forwarded to ``keyIn``.
        destination: Destination keyword forwarded to ``keyIn``.

    Returns:
        ``browser.page_source`` as read after the route has been keyed in.
    """
    # Fixed pause used after loading the page and after submitting the
    # route; presumably gives the page time to render -- TODO confirm.
    pause_seconds = 0.3

    browser.get('https://www.google.com/flights/#search')
    sleep(pause_seconds)

    keyIn(browser, departure, destination)
    sleep(pause_seconds)

    page_html = browser.page_source
    return page_html
def keyIn(browser, place1, place2):
    """Type two place keywords into the Google Flights search boxes.

    The class of the search ``div`` changes while typing, so each box is
    clicked first and the text input is looked up again before every
    ``send_keys`` call instead of being cached.

    Args:
        browser: Selenium WebDriver already showing the search page.
        place1: Keyword typed after clicking the first matching search box.
        place2: Keyword typed after clicking the second matching search box.

    Raises:
        IndexError: If the expected search box or text input is not present
            on the page (the XPath lookup returned too few elements).
    """
    # Imported locally so the block carries its own dependency.  The
    # ``find_elements_by_xpath`` helper was removed in Selenium 4.3;
    # ``find_elements(By.XPATH, ...)`` works on old and new releases alike.
    from selenium.webdriver.common.by import By

    search_box_xpath = "//div[@class='LJV2HGB-Ab-a']"
    input_xpath = "//input[@class='LJV2HGB-Mb-f']"

    for box_index, place in enumerate((place1, place2)):
        # Click the box first; only then does the text input become findable.
        browser.find_elements(By.XPATH, search_box_xpath)[box_index].click()
        browser.find_elements(By.XPATH, input_xpath)[0].send_keys(place)
        # Presumably waits for the autocomplete suggestions -- TODO confirm.
        sleep(0.3)
        browser.find_elements(By.XPATH, input_xpath)[0].send_keys(Keys.RETURN)
def get_tickets(departure, destination):
    """Scrape Google Flights for tickets between two places.

    Opens a Chrome WebDriver, lets ``from_to`` submit the search, then
    parses the resulting HTML with BeautifulSoup.

    Args:
        departure: Departure keyword passed to ``from_to``.
        destination: Destination keyword passed to ``from_to``.

    Returns:
        A list of dicts, one per result row, each with the string keys
        ``price``, ``ticket_type``, ``time``, ``company``, ``duration`` and
        ``flight_type``.  Rows missing any of these fields are skipped.

    Raises:
        Any exception raised while driving the browser propagates to the
        caller; the browser is closed first.
    """
    # Output key -> class of the <div> that holds the value inside a row.
    field_classes = {
        'price': 'LJV2HGB-d-Ab',        # fare
        'ticket_type': 'LJV2HGB-d-Cb',  # ticket type
        'time': 'LJV2HGB-d-Zb',         # departure/arrival time
        'company': 'LJV2HGB-d-j',       # airline
        'duration': 'LJV2HGB-d-E',      # flight duration
        'flight_type': 'LJV2HGB-d-Qb',  # flight info (non-stop / transfer)
    }

    browser = webdriver.Chrome()
    try:
        html = from_to(browser, departure, destination)
    finally:
        # Always close Chrome, even if the page interaction failed;
        # otherwise every failed search leaks a browser process.
        browser.quit()

    bs = BeautifulSoup(html, 'html.parser')
    tickets = []
    for ele in bs.find_all('div', 'LJV2HGB-d-W'):
        try:
            ticket = {
                field: ele.find('div', css_class).text
                for field, css_class in field_classes.items()
            }
        except AttributeError:
            # ``find`` returned None: this row lacks a field, so skip it.
            continue
        tickets.append(ticket)
    return tickets
# Django app views
from django.shortcuts import render | |
from django.http import JsonResponse | |
from .parse import get_tickets | |
def index(request):
    """Render the flight-search page from the ``plane/index.html`` template."""
    template_name = 'plane/index.html'
    context = {}
    return render(request, template_name, context)
def ajax(request):
    """Return the tickets for the requested route as JSON.

    Reads the ``dep`` (departure) and ``des`` (destination) query
    parameters and passes them to ``get_tickets``.

    Returns:
        ``JsonResponse`` with a ``tickets`` list on success, or a 400
        ``JsonResponse`` with an ``error`` message when either parameter is
        missing from the query string.
    """
    departure = request.GET.get('dep')
    destination = request.GET.get('des')
    if departure is None or destination is None:
        # Indexing request.GET directly would raise on a missing key and
        # surface as an HTTP 500; a missing parameter is a client error.
        return JsonResponse(
            {'error': "missing 'dep' or 'des' query parameter"},
            status=400,
        )

    data = {
        'tickets': get_tickets(departure, destination),
    }
    return JsonResponse(data)