Skip to content

Instantly share code, notes, and snippets.

@jorgerance
Created January 17, 2020 02:32
Show Gist options
  • Save jorgerance/8d516ee55bbd5977813e269ce1d2e809 to your computer and use it in GitHub Desktop.
Save jorgerance/8d516ee55bbd5977813e269ce1d2e809 to your computer and use it in GitHub Desktop.
[Web scraping with Node.js and PostgreSQL using Puppeteer] #example #node #nodejs #puppeteer #postgresql #postgres #scrpping #crawling
// Puppeteer
const puppeteer = require('puppeteer');
// PostgreSQL
const pg = require('pg');
// Login + PostgreSQL credentials
const CREDS = require('./creds');
// Base URL
const URL = 'https://<WOOCOMMERCE_FQDN>'
const PRODUCT = '<PRODUCT_URL_PATH>'
const PRODUCT_URL = URL + 'producto/' + PRODUCT
// Project name for debug purposes
const PROJECT = URL.split("/")[2];
// Current time in UTC
const NOW = new Date().toISOString().
replace(/T/, ' '). // replace T with a space
replace(/\..+/, ''); // delete the dot and everything after
// Device emulation
const iPad = puppeteer.devices['iPad Pro'];
// DOM element selectors
// Used puppetry in order to find the proper selectors --> https://puppetry.app/
const USERNAME_SELECTOR = '.woocommerce #username';
const PASSWORD_SELECTOR = '.woocommerce #password';
const LOGIN_BUTTON = '.woocommerce > .woodmart-registration-page > .login > .form-row > .button';
const PRICE_SELECTOR = '#product-2299 > div:nth-child(1) > div.row.product-image-summary-wrap > div > div > div.col-lg-8.col-12.col-md-6.woodmart-price-outside.summary.entry-summary > div > p.price > span';
// Screenshot patsh
const SS_PATH = 'screenshots'
const SS_URL = '1_homePage.png'
const SS_LOGINFORM = '2_loginForm.png'
const SS_PRELOGIN = '3_preLogin.png'
const SS_POSTLOGIN = '4_postLogin.png'
const SS_PRODUCTPAGE = '5_productPage.png'
// declare a new client instance from Pool()
const pool = new pg.Pool({
user: CREDS.pg_user,
host: CREDS.pg_host,
database: CREDS.pg_database,
password: CREDS.pg_password,
port: CREDS.pg_port
})
// String for the PostgreSQL table name
const tableName = 'ecommerce_prices'
async function run() {
console.log("\n\n Running " + PROJECT + " - " + NOW + " UTC");
console.log(" ===================================================\n");
// Vieport size
// const width = 1024;
// const height = 1600; <-- Replaced by device definition
const browser = await puppeteer.launch({
headless: true,
devtools: false
// defaultViewport: { 'width': width, 'height': height } <-- Replaced by device definition
});
// Start browser
const page = await browser.newPage();
await page.emulate(iPad);
// await page.setViewport({ 'width': width, 'height': height }); <-- Replaced by device definition
console.log(' - Browser opened');
// Load URL page
await page.goto(URL, { waitUntil: 'domcontentloaded' });
console.log(' - URL opened');
// Screenshot: URL
await page.screenshot({ path: SS_PATH + '/' + SS_URL });
console.log(" - Screenshot: " + SS_PATH + '/' + SS_URL);
// Wait for "MI CUENTA" selectos to appear in viewport
await page.waitForSelector("#menu-item-4920")
await page.evaluate(() => {
document.querySelector('#menu-item-4920 > a:nth-child(1)').click();
});
console.log(" - Selector '#menu-item-4920' found");
await page.waitForNavigation();
// Screenshot: loginForm
await page.screenshot({ path: SS_PATH + '/' + SS_LOGINFORM });
console.log(" - Screenshot: " + SS_PATH + '/' + SS_LOGINFORM);
// Enter credentials
await page.waitForSelector('input[type="text"]');
await page.waitForSelector(USERNAME_SELECTOR)
await page.click(USERNAME_SELECTOR)
await page.keyboard.type(CREDS.username);
console.log(" - Input: username");
await page.waitForSelector(PASSWORD_SELECTOR)
await page.click(PASSWORD_SELECTOR)
await page.keyboard.type(CREDS.password);
console.log(" - Input: password");
// Screenshot: loginForm before submiting
await page.screenshot({ path: SS_PATH + '/' + SS_PRELOGIN });
console.log(" - Screenshot: " + SS_PATH + '/' + SS_PRELOGIN);
await page.waitForSelector(LOGIN_BUTTON)
await page.click(LOGIN_BUTTON)
console.log(" - Input: login button");
await page.waitForSelector('#my-account-menu-tab > div > div.user-info > p')
// Screenshot: Viewport after login
await page.screenshot({ path: SS_PATH + '/' + SS_POSTLOGIN });
console.log(" - Screenshot: " + SS_PATH + '/' + SS_POSTLOGIN);
await page.goto(PRODUCT_URL, { waitUntil: 'domcontentloaded' });
const product_price = await page.$eval(PRICE_SELECTOR, el => el.innerText.replace('€', '').replace('.', ','))
console.log(" - Product price: " + product_price + " €")
// Screenshot: Product page
await page.screenshot({ path: SS_PATH + '/' + SS_PRODUCTPAGE });
console.log(" - Screenshot: " + SS_PATH + '/' + SS_PRODUCTPAGE);
//const client = await Pool.connect
console.log(" - PostgreSQL: Connected to " + CREDS.pg_database + "@" + CREDS.pg_host)
const data_price = product_price.replace(',', '.')
const values = [
data_price,
PRODUCT
];
console.log(" - PostgreSQL: Trying to insert product '" + PRODUCT + "' with price " + data_price + " €")
let sqlString = `
INSERT INTO ${tableName}
(price, url)
VALUES
($1, $2)`
// Pass the string and array to the pool's query() method
pool.query(sqlString, values, (err, res) => {
if (err) {
console.log('pool.query():', err)
}
if (res) {
console.log('pool.query():', res)
}
})
console.log(" - PostgreSQL: Inserted '" + PRODUCT + "' with price: " + data_price + " €")
pool.end()
console.log(" - PostgreSQL: Connection closed")
browser.close();
console.log("\n >> END\n");
console.log("\n ================================================\n");
}
run();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment