This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| let grabPosts = await page.evaluate(() => { | |
| let allPosts = document.body.querySelectorAll('.Post'); | |
| // storing the post items in an array then selecting for retrieving content | |
| scrapeItems = []; | |
| allPosts.forEach(item => { | |
| let postTitle = item.querySelector('h3'); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| const puppeteer = require('puppeteer'); | |
| // starting Puppeteer | |
| puppeteer.launch().then(async browser => { | |
| // opening a new page and navigating to Reddit | |
| const page = await browser.newPage(); | |
| await page.goto('https://www.reddit.com/r/scraping/'); | |
| await page.waitForSelector('body'); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| let ip_addresses = []; | |
| let port_numbers = []; | |
| request("https://sslproxies.org/", function(error, response, html) { | |
| if (!error && response.statusCode == 200) { | |
| const $ = cheerio.load(html); | |
| $("td:nth-child(1)").each(function(index, value) { | |
| ip_addresses[index] = $(this).text(); | |
| }); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| const request = require("request"); | |
| const cheerio = require("cheerio"); | |
| function proxyGenerator() { | |
| let ip_addresses = []; | |
| let port_numbers = []; | |
| let proxy; | |
| request("https://sslproxies.org/", function(error, response, html) { | |
| if (!error && response.statusCode == 200) { |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| const options = { | |
| url: "https://www.forextradingbig.com/10-facts-you-must-know-on-online-forex-trading/", | |
| method: "GET", | |
| proxy: proxyGenerator() | |
| }; | |
| request(options, function(error, response, html) { | |
| if (!error && response.statusCode == 200) { | |
| const $ = cheerio.load(html); | |
| let article_headings = $("h2").text(); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def proxy_generator():
    """Pick a random proxy from the listing served at https://sslproxies.org/.

    The page is downloaded and parsed, every ``<td>`` cell is collected, and
    each cell at a multiple-of-8 offset is paired with the cell that follows
    it to form an ``"<ip>:<port>"`` string.

    Returns:
        dict: ``{'https': '<ip>:<port>'}`` for one randomly chosen pair.

    Raises:
        IndexError: if the page yields no cells (``choice`` on an empty list).
    """
    response = requests.get("https://sslproxies.org/")
    soup = BeautifulSoup(response.content, 'html5lib')
    cells = soup.findAll('td')
    # Assumes 8 cells per table row with the IP first and the port second
    # -- TODO confirm against the live page layout.
    candidates = [ip.text + ':' + port.text for ip, port in zip(cells[::8], cells[1::8])]
    # The proxy value must be a single "host:port" string, not an (ip, port) tuple.
    proxy = {'https': choice(candidates)}
    return proxy
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def data_scraper(request_method, url, **kwargs):
    """Send a request through randomly chosen proxies until one attempt succeeds.

    Args:
        request_method: HTTP method name handed to ``requests.request``.
        url: Target URL.
        **kwargs: Extra keyword arguments forwarded to ``requests.request``.

    Returns:
        The response object from the first attempt that did not raise.

    Note:
        There is no retry limit: the loop keeps drawing new proxies for as
        long as every attempt raises.
    """
    while True:
        try:
            proxy = proxy_generator()
            print("Proxy currently being used: {}".format(proxy))
            response = requests.request(request_method, url, proxies=proxy, timeout=7, **kwargs)
        except Exception:
            # Catch Exception rather than everything so Ctrl-C / SystemExit
            # can still break out of the retry loop.
            print("Connection error, looking for another proxy")
        else:
            # if the request is successful, no exception is raised
            return response
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import requests | |
| from bs4 import BeautifulSoup | |
| from random import choice | |
def proxy_generator():
    """Return a randomly selected HTTPS proxy scraped from sslproxies.org.

    Every ``<td>`` cell of the fetched page is read; the text of each cell at
    a multiple-of-8 offset is joined by ``':'`` with the text of the cell
    immediately after it.

    Returns:
        dict: a one-entry mapping ``{'https': '<first cell>:<second cell>'}``.
    """
    page = requests.get("https://sslproxies.org/")
    parsed = BeautifulSoup(page.content, 'html5lib')
    cells = parsed.findAll('td')
    # presumably 8 columns per row, IP in the first and port in the second -- verify against the page
    hosts = [cell.text for cell in cells[::8]]
    ports = [cell.text for cell in cells[1::8]]
    endpoints = [host + ':' + port for host, port in zip(hosts, ports)]
    return {'https': choice(endpoints)}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| $results = array(); | |
| if (!empty($html)) { | |
| $div_class = $title = ""; | |
| $i = 0; | |
| foreach ($html->find(".review-container") as $div_class) { | |
| //Extract the review title |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| function convertToXML($results, &$xml_user_info){ | |
| foreach($results as $key => $value){ | |
| if(is_array($results)){ | |
| $subnode = $xml_user_info->addChild($key); | |
| foreach ($value as $k=>$v) { | |
| $xml_user_info->addChild("$k", $v); | |
| } | |
| }else{ | |
| $xml_user_info->addChild("$key",htmlspecialchars("$value")); | |
| } |