Created
January 17, 2020 02:32
-
-
Save jorgerance/8d516ee55bbd5977813e269ce1d2e809 to your computer and use it in GitHub Desktop.
[Web scraping with Node.js and PostgreSQL using Puppeteer] #example #node #nodejs #puppeteer #postgresql #postgres #scrpping #crawling
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Puppeteer | |
const puppeteer = require('puppeteer'); | |
// PostgreSQL | |
const pg = require('pg'); | |
// Login + PostgreSQL credentials | |
const CREDS = require('./creds'); | |
// Base URL | |
const URL = 'https://<WOOCOMMERCE_FQDN>' | |
const PRODUCT = '<PRODUCT_URL_PATH>' | |
const PRODUCT_URL = URL + 'producto/' + PRODUCT | |
// Project name for debug purposes | |
const PROJECT = URL.split("/")[2]; | |
// Current time in UTC | |
const NOW = new Date().toISOString(). | |
replace(/T/, ' '). // replace T with a space | |
replace(/\..+/, ''); // delete the dot and everything after | |
// Device emulation | |
const iPad = puppeteer.devices['iPad Pro']; | |
// DOM element selectors | |
// Used puppetry in order to find the proper selectors --> https://puppetry.app/ | |
const USERNAME_SELECTOR = '.woocommerce #username'; | |
const PASSWORD_SELECTOR = '.woocommerce #password'; | |
const LOGIN_BUTTON = '.woocommerce > .woodmart-registration-page > .login > .form-row > .button'; | |
const PRICE_SELECTOR = '#product-2299 > div:nth-child(1) > div.row.product-image-summary-wrap > div > div > div.col-lg-8.col-12.col-md-6.woodmart-price-outside.summary.entry-summary > div > p.price > span'; | |
// Screenshot patsh | |
const SS_PATH = 'screenshots' | |
const SS_URL = '1_homePage.png' | |
const SS_LOGINFORM = '2_loginForm.png' | |
const SS_PRELOGIN = '3_preLogin.png' | |
const SS_POSTLOGIN = '4_postLogin.png' | |
const SS_PRODUCTPAGE = '5_productPage.png' | |
// declare a new client instance from Pool() | |
const pool = new pg.Pool({ | |
user: CREDS.pg_user, | |
host: CREDS.pg_host, | |
database: CREDS.pg_database, | |
password: CREDS.pg_password, | |
port: CREDS.pg_port | |
}) | |
// String for the PostgreSQL table name | |
const tableName = 'ecommerce_prices' | |
async function run() { | |
console.log("\n\n Running " + PROJECT + " - " + NOW + " UTC"); | |
console.log(" ===================================================\n"); | |
// Vieport size | |
// const width = 1024; | |
// const height = 1600; <-- Replaced by device definition | |
const browser = await puppeteer.launch({ | |
headless: true, | |
devtools: false | |
// defaultViewport: { 'width': width, 'height': height } <-- Replaced by device definition | |
}); | |
// Start browser | |
const page = await browser.newPage(); | |
await page.emulate(iPad); | |
// await page.setViewport({ 'width': width, 'height': height }); <-- Replaced by device definition | |
console.log(' - Browser opened'); | |
// Load URL page | |
await page.goto(URL, { waitUntil: 'domcontentloaded' }); | |
console.log(' - URL opened'); | |
// Screenshot: URL | |
await page.screenshot({ path: SS_PATH + '/' + SS_URL }); | |
console.log(" - Screenshot: " + SS_PATH + '/' + SS_URL); | |
// Wait for "MI CUENTA" selectos to appear in viewport | |
await page.waitForSelector("#menu-item-4920") | |
await page.evaluate(() => { | |
document.querySelector('#menu-item-4920 > a:nth-child(1)').click(); | |
}); | |
console.log(" - Selector '#menu-item-4920' found"); | |
await page.waitForNavigation(); | |
// Screenshot: loginForm | |
await page.screenshot({ path: SS_PATH + '/' + SS_LOGINFORM }); | |
console.log(" - Screenshot: " + SS_PATH + '/' + SS_LOGINFORM); | |
// Enter credentials | |
await page.waitForSelector('input[type="text"]'); | |
await page.waitForSelector(USERNAME_SELECTOR) | |
await page.click(USERNAME_SELECTOR) | |
await page.keyboard.type(CREDS.username); | |
console.log(" - Input: username"); | |
await page.waitForSelector(PASSWORD_SELECTOR) | |
await page.click(PASSWORD_SELECTOR) | |
await page.keyboard.type(CREDS.password); | |
console.log(" - Input: password"); | |
// Screenshot: loginForm before submiting | |
await page.screenshot({ path: SS_PATH + '/' + SS_PRELOGIN }); | |
console.log(" - Screenshot: " + SS_PATH + '/' + SS_PRELOGIN); | |
await page.waitForSelector(LOGIN_BUTTON) | |
await page.click(LOGIN_BUTTON) | |
console.log(" - Input: login button"); | |
await page.waitForSelector('#my-account-menu-tab > div > div.user-info > p') | |
// Screenshot: Viewport after login | |
await page.screenshot({ path: SS_PATH + '/' + SS_POSTLOGIN }); | |
console.log(" - Screenshot: " + SS_PATH + '/' + SS_POSTLOGIN); | |
await page.goto(PRODUCT_URL, { waitUntil: 'domcontentloaded' }); | |
const product_price = await page.$eval(PRICE_SELECTOR, el => el.innerText.replace('€', '').replace('.', ',')) | |
console.log(" - Product price: " + product_price + " €") | |
// Screenshot: Product page | |
await page.screenshot({ path: SS_PATH + '/' + SS_PRODUCTPAGE }); | |
console.log(" - Screenshot: " + SS_PATH + '/' + SS_PRODUCTPAGE); | |
//const client = await Pool.connect | |
console.log(" - PostgreSQL: Connected to " + CREDS.pg_database + "@" + CREDS.pg_host) | |
const data_price = product_price.replace(',', '.') | |
const values = [ | |
data_price, | |
PRODUCT | |
]; | |
console.log(" - PostgreSQL: Trying to insert product '" + PRODUCT + "' with price " + data_price + " €") | |
let sqlString = ` | |
INSERT INTO ${tableName} | |
(price, url) | |
VALUES | |
($1, $2)` | |
// Pass the string and array to the pool's query() method | |
pool.query(sqlString, values, (err, res) => { | |
if (err) { | |
console.log('pool.query():', err) | |
} | |
if (res) { | |
console.log('pool.query():', res) | |
} | |
}) | |
console.log(" - PostgreSQL: Inserted '" + PRODUCT + "' with price: " + data_price + " €") | |
pool.end() | |
console.log(" - PostgreSQL: Connection closed") | |
browser.close(); | |
console.log("\n >> END\n"); | |
console.log("\n ================================================\n"); | |
} | |
run(); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment