用爬蟲的方式,透過 Google Flights 列出所有機票。
使用 Selenium 進行爬蟲,但 Google 的 class 名稱都被打亂(混淆)了,所以 trace 了很久。
而且中途會遇到 class 改變的狀況;觀察了一陣子之後發現,要先 click 搜尋框才能抓到我要的 element,再輸入資料。
終於前進到要的頁面之後,就把 HTML 丟到 BeautifulSoup 裡面解析,之後再用 AJAX 丟到前端就搞定了!
:D
<!DOCTYPE html>
<!-- Page content is Traditional Chinese, so declare it for browsers and screen readers. -->
<html lang="zh-Hant">
<head>
    <meta charset="UTF-8">
    <title>找機票</title>
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-beta.2/css/bootstrap.min.css" integrity="sha384-PsH8R72JQ3SOdhVi3uxftmaW6Vc51MKb0q5P2rRUpPvrszuE4W1povHYgTpBfshb" crossorigin="anonymous">
    <!-- jQuery (same 1.11.1 release, served over HTTPS so it is not blocked as mixed content) -->
    <script src="https://code.jquery.com/jquery-1.11.1.min.js"></script>
    <style>
        /* Search form wrapper */
        .search{
            padding: 30px;
            text-align: center;
        }
        /* One ticket card in the result list */
        .box{
            margin-top: 30px;
        }
        /* Ticket price */
        h2{
            font-size: 24px;
            color: red;
            padding: 0 10px 0 10px;
        }
        /* Result summary heading */
        h2.title{
            color:black;
            margin-top:20px;
        }
        /* Airline name */
        h3{
            font-size: 18px;
            padding: 10px;
        }
        /* Flight time and duration */
        h4{
            font-size: 16px;
            padding: 10px;
            color: green;
        }
    </style>
</head>
<body>
    <div class="row">
        <div class="col-3"></div>
        <div class="col-6">
            <div class="search">
                從 <input type="text" id="departure">
                到 <input type="text" id="destination">
                <br>
                <button id="submit" type="button" class="btn btn-info" style="float: right;">搜尋</button>
            </div>
            <!-- Search results are rendered into this container by the click handler. -->
            <div id="output"></div>
        </div>
        <div class="col-3"></div>
    </div>
</body>
| <script> | |
// Search button handler: sends the departure/destination to the "ajax"
// endpoint and renders the returned tickets into #output.
$('#submit').click(function () {
    // Read the inputs once so the request and the result title always agree.
    var dep = $("#departure").val();
    var des = $("#destination").val();
    var $output = $("#output");

    // Build an element whose content is set with .text(), so scraped data and
    // user input are shown literally instead of being parsed as HTML (XSS).
    // String() keeps the original concatenation semantics and prevents
    // .text(undefined) from acting as a getter.
    function textElement(tag, className, content) {
        return $("<" + tag + ">").addClass(className).text(String(content));
    }

    $.ajax({
        type: "GET",
        url: "ajax",
        data: {
            des: des,
            dep: dep
        },
        success: function (result) {
            var tickets = (result && result['tickets']) || [];

            if (tickets.length === 0) {
                $output.html("<h2 class='title'>找到 0 筆資料,換個關鍵字吧!</h2>");
                return;
            }

            $output.empty().append(
                textElement("h2", "title",
                    "找到 " + tickets.length + " 筆從 " + dep + " 到 " + des + " 的機票")
            );

            for (var i = 0; i < tickets.length; i++) {
                var ticket = tickets[i];
                $("<div>")
                    .addClass("alert alert-secondary box")
                    .append(textElement("h2", "", ticket['price']))
                    .append(textElement("h3", "", ticket['company']))
                    .append(textElement("h4", "", ticket['time'] + " (" + ticket['duration'] + ")"))
                    .append(textElement("span", "badge badge-success", ticket['ticket_type']))
                    .appendTo($output);
            }
        },
        // The backend drives a real browser, so failures/timeouts are plausible;
        // tell the user instead of leaving the page silently unchanged.
        error: function () {
            $output.empty().append(textElement("h2", "title", "搜尋失敗,請稍後再試!"));
        }
    });
});
| </script> | |
| </html> |
| # main parse code | |
| from selenium import webdriver | |
| from selenium.webdriver.common.keys import Keys | |
| from time import sleep | |
| from bs4 import BeautifulSoup | |
def from_to(browser, departure, destination):
    """
    Open the Google Flights search page, enter the route, and return the HTML.

    Args:
        browser: Selenium WebDriver used to load and drive the page.
        departure: text typed into the first search box.
        destination: text typed into the second search box.

    Returns:
        The browser's page source after both places have been entered.
    """
    # Fixed pause used after loading the page and after typing the route;
    # presumably gives the page time to render -- there is no explicit wait.
    pause_seconds = 0.3

    browser.get('https://www.google.com/flights/#search')
    sleep(pause_seconds)

    keyIn(browser, departure, destination)
    sleep(pause_seconds)

    page_html = browser.page_source
    return page_html
def keyIn(browser, place1, place2):
    """
    Type two places into the Google Flights search form, confirming each.

    The div classes change while typing, so each search box is clicked first
    and the text input is looked up again before every interaction.

    Args:
        browser: Selenium WebDriver already showing the search page.
        place1: text entered after clicking the first search box.
        place2: text entered after clicking the second search box.

    Raises:
        IndexError: if the expected search box or input is not on the page.
    """
    # Obfuscated class names observed on the page; they may change at any time.
    search_box_xpath = "//div[@class='LJV2HGB-Ab-a']"
    input_xpath = "//input[@class='LJV2HGB-Mb-f']"

    def find_all(xpath):
        # find_elements("xpath", ...) exists in both Selenium 3 and 4, whereas
        # the find_elements_by_xpath shortcut was removed in Selenium 4.3.
        return browser.find_elements("xpath", xpath)

    for box_index, place in enumerate((place1, place2)):
        # Clicking the box is what makes the text input reachable.
        find_all(search_box_xpath)[box_index].click()
        find_all(input_xpath)[0].send_keys(place)
        sleep(0.3)
        # Re-query before pressing RETURN: the element may have been replaced.
        find_all(input_xpath)[0].send_keys(Keys.RETURN)
def get_tickets(departure, destination):
    """
    Scrape Google Flights with Selenium and return the tickets found.

    Args:
        departure: place typed into the departure search box.
        destination: place typed into the destination search box.

    Returns:
        A list of dicts, one per result row, with the keys ``price``,
        ``ticket_type``, ``time``, ``company``, ``duration`` and
        ``flight_type``. Rows missing any of these fields are skipped.
    """
    browser = webdriver.Chrome()
    try:
        html = from_to(browser, departure, destination)
    finally:
        # Always close the browser, even if the scrape fails, so no Chrome
        # process is left running.
        browser.quit()

    # (ticket key, obfuscated CSS class of the div holding that field)
    field_classes = (
        ('price', 'LJV2HGB-d-Ab'),        # ticket price
        ('ticket_type', 'LJV2HGB-d-Cb'),  # ticket type
        ('time', 'LJV2HGB-d-Zb'),         # departure/arrival time
        ('company', 'LJV2HGB-d-j'),       # airline
        ('duration', 'LJV2HGB-d-E'),      # flight duration
        ('flight_type', 'LJV2HGB-d-Qb'),  # flight info (direct / transfer)
    )

    soup = BeautifulSoup(html, 'html.parser')
    tickets = []
    for row in soup.find_all('div', 'LJV2HGB-d-W'):
        try:
            ticket = {
                key: row.find('div', css_class).text
                for key, css_class in field_classes
            }
        except AttributeError:
            # find() returned None: this row lacks a field, so it is not a
            # complete ticket entry.
            continue
        tickets.append(ticket)
    return tickets
| # djangoapp views | |
| from django.shortcuts import render | |
| from django.http import JsonResponse | |
| from .parse import get_tickets | |
def index(request):
    """Render the search page template ``plane/index.html`` with an empty context."""
    empty_context = {}
    return render(request, 'plane/index.html', empty_context)
def ajax(request):
    """
    Return the tickets for the requested route as JSON.

    Reads the ``dep`` and ``des`` query parameters and responds with
    ``{"tickets": [...]}``. If either parameter is missing, responds with
    HTTP 400 and an ``error`` message instead of failing with a server error.
    """
    departure = request.GET.get('dep')
    destination = request.GET.get('des')
    if departure is None or destination is None:
        return JsonResponse(
            {
                'tickets': [],
                'error': "Both 'dep' and 'des' query parameters are required.",
            },
            status=400,
        )

    data = {
        'tickets': get_tickets(departure, destination),
    }
    return JsonResponse(data)