This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
docker run -p 3128:3128 scrapinghub/crawlera-headless-proxy -d -a $APIKey -z ^((?!httpbin).)*$ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
time="2019-10-09T13:27:29Z" level=fatal msg="cannot create an instance of proxy: cannot create CA: invalid certificates: tls: private key does not match public key"
# String I used on the command line is: docker run -p 3128:3128 scrapinghub/crawlera-headless-proxy -d -a $crawleraAPIKey -z "(.*cdn.*)|(.*digicert.*)|(.*google.*)|(.*firefox.*)|(.*services.*)|(.*doubleclick.*)|(.*\.(css|jpg|png|woff|ttf|wof|jpeg|bmp|wav|mp4|ogg|webp))$" --tls-ca-certificate=/etc/ssl/certs/ca-certificates.crt |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import logging | |
import logging | |
from tenacity import retry | |
from tenacity import stop_after_attempt | |
import datetime | |
import requests | |
from bs4 import BeautifulSoup | |
import csv | |
from selenium import webdriver |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import requests | |
import requests_cache | |
from bs4 import BeautifulSoup | |
from multiprocessing import Pool | |
from db import get_counties | |
import math | |
from pprint import pprint |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Traceback (most recent call last):
  File "land.py", line 72, in <module>
    main()
  File "land.py", line 64, in main
    soups = p.map(get_soups, paginated_urls)
  File "/Users/work/.pyenv/versions/3.7.3/lib/python3.7/multiprocessing/pool.py", line 268, in map
    return self._map_async(func, iterable, mapstar, chunksize).get()
  File "/Users/work/.pyenv/versions/3.7.3/lib/python3.7/multiprocessing/pool.py", line 657, in get
    raise self._value
multiprocessing.pool.MaybeEncodingError: Error sending result: '[<!DOCTYPE html>
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Code from http://dangoldin.com/2018/11/16/python-3-and-aiohttp/ | |
import aiohttp | |
import asyncio | |
import datetime | |
timeout = aiohttp.ClientTimeout(total=10) | |
URLS = [ | |
'http://httpbin.org/delay/1?1', |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
✘ work@Lances-MacBook-Pro ~/Dropbox/Projects/pyland serpErrors ● python serp.py | |
https://www.landwatch.com/default.aspx?ct=R&type=268,6843;5,25;6,742;13,12&pg=9 <class 'str'> | |
https://www.landwatch.com/default.aspx?ct=R&type=268,6843;5,25;6,742;13,12&pg=10 <class 'str'> | |
https://www.landwatch.com/default.aspx?ct=R&type=268,6843;5,25;6,742;13,12&pg=7 <class 'str'> | |
https://www.landwatch.com/default.aspx?ct=R&type=268,6843;5,25;6,742;13,12&pg=8 <class 'str'> | |
https://www.landwatch.com/default.aspx?ct=R&type=268,6843;5,25;6,742;13,12&pg=5 <class 'str'> | |
https://www.landwatch.com/default.aspx?ct=R&type=268,6843;5,25;6,742;13,12&pg=11 <class 'str'> | |
https://www.landwatch.com/default.aspx?ct=R&type=268,6843;5,25;6,742;13,12&pg=4 <class 'str'> | |
https://www.landwatch.com/default.aspx?ct=R&type=268,6843;5,25;6,742;13,12&pg=3 <class 'str'> | |
https://www.landwatch.com/default.aspx?ct=R&type=268,6843;5,25;6,742;13,12&pg=6 <class 'str'> |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import psycopg2 | |
import psycopg2.extras | |
from psycopg2.extensions import AsIs | |
import os | |
import datetime | |
def write_listings(listing_dicts): | |
try: | |
connection = psycopg2.connect( |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pprint import pprint | |
json_url_params = {'filterState': {}} | |
price_bin_jsons = [] | |
counter = 0 | |
for i in range(0, 2): | |
print('Start', counter) | |
pprint(price_bin_jsons) | |
json_url_params['filterState']['price'] = { |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# I know this doesn't work, but here's what I want to do | |
async def gather_object_blocks(list_of_objects): | |
object_blocks = [list_of_objects[i:i + concurrency_limit] | |
for i in range(0, len(list_of_objects), concurrency_limit)] | |
async with aiohttp.ClientSession(): | |
for sub_block in object_blocks: | |
await asyncio.gather( | |
for the_object in sub_block: | |
try: |
Older Newer