docker run -p 3128:3128 scrapinghub/crawlera-headless-proxy -d -a $APIKey -z ^((?!httpbin).)*$
time="2019-10-09T13:27:29Z" level=fatal msg="cannot create an instance of proxy: cannot create CA: invalid certificates: tls: private key does not match public key"
# String I used on the command line is: docker run -p 3128:3128 scrapinghub/crawlera-headless-proxy -d -a $crawleraAPIKey -z "(.*cdn.*)|(.*digicert.*)|(.*google.*)|(.*firefox.*)|(.*services.*)|(.*doubleclick.*)|(.*\.(css|jpg|png|woff|ttf|wof|jpeg|bmp|wav|mp4|ogg|webp))$" --tls-ca-certificate=/etc/ssl/certs/ca-certificates.crt
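The second invocation differs from the first in two ways: the skip regex passed to -z is broader, and --tls-ca-certificate points the proxy at the system CA bundle, which is presumably what cleared the "cannot create CA ... private key does not match public key" fatal. Once the container is up, traffic can be routed through it from Python; a minimal sketch, assuming the proxy is local on port 3128 and the same CA bundle is readable on the client side:

import requests

proxies = {
    'http': 'http://localhost:3128',
    'https': 'http://localhost:3128',
}
response = requests.get('https://httpbin.org/ip', proxies=proxies,
                        verify='/etc/ssl/certs/ca-certificates.crt')
print(response.status_code, response.text)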
import csv
import datetime
import logging
import math
import os
import re
from multiprocessing import Pool
from pprint import pprint

import requests
import requests_cache
from bs4 import BeautifulSoup
from selenium import webdriver
from tenacity import retry, stop_after_attempt

from db import get_counties
Traceback (most recent call last):
  File "land.py", line 72, in <module>
    main()
  File "land.py", line 64, in main
    soups = p.map(get_soups, paginated_urls)
  File "/Users/work/.pyenv/versions/3.7.3/lib/python3.7/multiprocessing/pool.py", line 268, in map
    return self._map_async(func, iterable, mapstar, chunksize).get()
  File "/Users/work/.pyenv/versions/3.7.3/lib/python3.7/multiprocessing/pool.py", line 657, in get
    raise self._value
multiprocessing.pool.MaybeEncodingError: Error sending result: '[<!DOCTYPE html>
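The likely cause (an inference from the truncated result above, which begins with raw HTML): get_soups returns BeautifulSoup objects, and the worker processes cannot pickle them back to the parent, so the pool raises MaybeEncodingError while sending the result. A minimal sketch of the usual workaround is to return plain HTML strings from the workers and parse in the parent (get_html is a stand-in name, not the original function):

import requests
from bs4 import BeautifulSoup
from multiprocessing import Pool

def get_html(url):
    # strings pickle cleanly, unlike BeautifulSoup trees
    return requests.get(url).text

def get_all_soups(paginated_urls):
    with Pool(4) as p:
        pages = p.map(get_html, paginated_urls)
    return [BeautifulSoup(page, 'html.parser') for page in pages]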
# Code from http://dangoldin.com/2018/11/16/python-3-and-aiohttp/
import aiohttp
import asyncio
import datetime
timeout = aiohttp.ClientTimeout(total=10)
URLS = [
    'http://httpbin.org/delay/1?1',
    # remaining URLs truncated in the original snippet
]
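A sketch of where the snippet is headed, following the gather pattern from the linked post (the fetch/main names are assumptions, not the post's exact code):

async def fetch(session, url):
    async with session.get(url) as response:
        return await response.text()

async def main():
    start = datetime.datetime.now()
    async with aiohttp.ClientSession(timeout=timeout) as session:
        pages = await asyncio.gather(*(fetch(session, url) for url in URLS))
    print(len(pages), 'pages fetched in', datetime.datetime.now() - start)

asyncio.run(main())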
~/Dropbox/Projects/pyland (serpErrors) $ python serp.py
https://www.landwatch.com/default.aspx?ct=R&type=268,6843;5,25;6,742;13,12&pg=9 <class 'str'>
https://www.landwatch.com/default.aspx?ct=R&type=268,6843;5,25;6,742;13,12&pg=10 <class 'str'>
https://www.landwatch.com/default.aspx?ct=R&type=268,6843;5,25;6,742;13,12&pg=7 <class 'str'>
https://www.landwatch.com/default.aspx?ct=R&type=268,6843;5,25;6,742;13,12&pg=8 <class 'str'>
https://www.landwatch.com/default.aspx?ct=R&type=268,6843;5,25;6,742;13,12&pg=5 <class 'str'>
https://www.landwatch.com/default.aspx?ct=R&type=268,6843;5,25;6,742;13,12&pg=11 <class 'str'>
https://www.landwatch.com/default.aspx?ct=R&type=268,6843;5,25;6,742;13,12&pg=4 <class 'str'>
https://www.landwatch.com/default.aspx?ct=R&type=268,6843;5,25;6,742;13,12&pg=3 <class 'str'>
https://www.landwatch.com/default.aspx?ct=R&type=268,6843;5,25;6,742;13,12&pg=6 <class 'str'>
import psycopg2
import psycopg2.extras
from psycopg2.extensions import AsIs
import os
import datetime
def write_listings(listing_dicts):
    try:
        # DATABASE_URL and the `listings` table name are assumptions; adjust to your schema.
        connection = psycopg2.connect(os.environ['DATABASE_URL'])
        with connection, connection.cursor() as cursor:
            for listing in listing_dicts:
                cursor.execute(
                    'INSERT INTO listings (%s) VALUES %s',
                    (AsIs(', '.join(listing)), tuple(listing.values())),
                )
    except psycopg2.Error as exc:
        print('write_listings failed:', exc)
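When every dict shares the same keys, psycopg2.extras.execute_values (imported above) batches the whole insert in one round trip; a sketch under the same schema assumptions:

def write_listings_batch(connection, listing_dicts):
    columns = list(listing_dicts[0].keys())
    rows = [tuple(d[c] for c in columns) for d in listing_dicts]
    with connection, connection.cursor() as cursor:
        psycopg2.extras.execute_values(
            cursor,
            'INSERT INTO listings ({}) VALUES %s'.format(', '.join(columns)),
            rows,
        )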
from pprint import pprint

json_url_params = {'filterState': {}}
price_bin_jsons = []
counter = 0
for i in range(0, 2):
    print('Start', counter)
    pprint(price_bin_jsons)
    json_url_params['filterState']['price'] = {
        # dict contents truncated in the original snippet
    }
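A hedged guess at how these per-bin params get used, judging by the variable names (the endpoint and query parameter name are placeholders, not from the original):

import json
import requests

response = requests.get(
    'https://www.example.com/search',  # placeholder endpoint
    params={'filterState': json.dumps(json_url_params['filterState'])},
)
price_bin_jsons.append(response.json())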
# I know this doesn't work, but here's what I want to do -- rewritten below so
# it runs: gather() takes coroutines as positional arguments, so the inner
# `for` becomes an unpacked generator expression.
async def gather_object_blocks(list_of_objects):
    # concurrency_limit is assumed to be defined elsewhere
    object_blocks = [list_of_objects[i:i + concurrency_limit]
                     for i in range(0, len(list_of_objects), concurrency_limit)]
    results = []
    async with aiohttp.ClientSession() as session:
        for sub_block in object_blocks:
            block_results = await asyncio.gather(
                *(fetch_object(session, the_object) for the_object in sub_block),
                return_exceptions=True,  # replaces the per-object try/except
            )
            results.extend(block_results)
    return results
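fetch_object above is a hypothetical helper, not part of the original snippet; a minimal version, assuming each object carries the URL to fetch:

async def fetch_object(session, the_object):
    async with session.get(the_object['url']) as response:
        return await response.text()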