Skip to content

Instantly share code, notes, and snippets.

@Th3redTea
Created September 20, 2025 12:54
Show Gist options
  • Save Th3redTea/e15226e8f56c6cdde18a0b16e403e7b7 to your computer and use it in GitHub Desktop.
Save Th3redTea/e15226e8f56c6cdde18a0b16e403e7b7 to your computer and use it in GitHub Desktop.

🐍 Python Crawling Libraries Cheatsheet


Library: requests

Use case: Fetch a single page and inspect its HTML.

import requests

# Send a GET request to a page
# requests has no default timeout: without one, a stalled server blocks forever
resp = requests.get("https://example.com", timeout=10)

# Print the HTML of the page
print(resp.text[:200])  # show first 200 characters

Library: httpx

Use case: Fetch multiple pages concurrently with async.

import httpx, asyncio

async def main():
    """Fetch a fixed list of URLs concurrently and print each URL with its text length."""
    targets = ["https://example.com", "https://httpbin.org/get"]
    async with httpx.AsyncClient() as client:
        # Create every request coroutine first, then await them together
        pending = [client.get(target) for target in targets]
        results = await asyncio.gather(*pending)
        for response in results:
            # Length of the decoded body text, in characters
            print(response.url, len(response.text))

asyncio.run(main())

Library: aiohttp

Use case: Download a page asynchronously with a session.

import aiohttp, asyncio

async def fetch(url):
    """Open a short-lived aiohttp session, GET url, and return the body as text."""
    async with aiohttp.ClientSession() as session, session.get(url) as resp:
        # Read the body while the response is still open
        body = await resp.text()
    return body

print(asyncio.run(fetch("https://example.com")))

Library: BeautifulSoup4

Use case: Extract all links (<a href>) from a page.

from bs4 import BeautifulSoup

html = "<html><body><a href='/doc'>Docs</a></body></html>"
soup = BeautifulSoup(html, "html.parser")

# Get all href values
links = [a["href"] for a in soup.find_all("a", href=True)]
print(links)  # ['/doc']

Library: lxml

Use case: Use XPath to get all links from HTML.

from lxml import html

page = "<html><body><a href='/x'>X</a><a href='/y'>Y</a></body></html>"
tree = html.fromstring(page)

# Extract href attributes with XPath
links = tree.xpath("//a/@href")
print(links)  # ['/x', '/y']

Library: urllib.parse

Use case: Normalize and analyze URLs.

from urllib.parse import urljoin, urldefrag, urlparse

# Join relative URL with base
print(urljoin("https://site/guide/", "../api"))  # https://site/api

# Remove fragment
print(urldefrag("https://site/page#top")[0])  # https://site/page

# Get only the path
print(urlparse("https://site/path?x=1").path)  # /path

Library: tldextract

Use case: Extract domain parts from a URL.

import tldextract

ext = tldextract.extract("https://sub.docs.example.co.uk")
print(ext.subdomain)  # sub.docs
print(ext.domain)     # example
print(ext.suffix)     # co.uk

Library: urllib.robotparser

Use case: Check if crawling a path is allowed by robots.txt.

from urllib.robotparser import RobotFileParser

rp = RobotFileParser("https://example.com/robots.txt")
rp.read()

# Can my bot fetch this URL?
print(rp.can_fetch("*", "https://example.com/private"))  # True/False

Library: feedparser

Use case: Parse an RSS feed and get links.

import feedparser

feed = feedparser.parse("https://hnrss.org/frontpage")
for entry in feed.entries[:3]:
    print(entry.link)  # print first 3 links

Library: xml.etree.ElementTree

Use case: Parse sitemap.xml and extract <loc> URLs.

import xml.etree.ElementTree as ET

# Real sitemaps declare the sitemaps.org namespace on <urlset>
xml = (
    '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
    "<url><loc>https://site/a</loc></url>"
    "</urlset>"
)
root = ET.fromstring(xml)

# Find all <loc> tags. "{*}" (Python 3.8+) matches any namespace or none;
# a bare ".//loc" would return nothing for a namespaced sitemap.
urls = [loc.text for loc in root.findall(".//{*}loc")]
print(urls)  # ['https://site/a']

Library: json

Use case: Read a JSON search index from docs.

import json

data = '{"urls":["/a","/b"]}'
parsed = json.loads(data)

# Print all URLs inside
print(parsed["urls"])  # ['/a', '/b']

Library: playwright

Use case: Render a page with JavaScript and grab links.

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    page.goto("https://example.com")
    links = page.eval_on_selector_all("a", "els => els.map(e => e.href)")
    print(links)
    browser.close()

Library: selenium

Use case: Control a browser to get dynamic content.

from selenium import webdriver

driver = webdriver.Chrome()
try:
    driver.get("https://example.com")

    # Extract all link hrefs
    links = [a.get_attribute("href") for a in driver.find_elements("tag name", "a")]
    print(links)
finally:
    # Always shut the browser down, even if navigation or scraping raised
    driver.quit()

Library: re

Use case: Match only API paths with regex.

import re

rx = re.compile(r"^/api/v1/.*")
print(bool(rx.match("/api/v1/users")))     # True
print(bool(rx.match("/static/js/app.js"))) # False

🐍 Python Built-ins Cheatsheet — Use Cases + Code Examples

A practical, beginner-friendly list of useful Python built-in functions with a quick use case and a tiny code example you can copy.


abs()

Use case: Get absolute value (distance from zero).

print(abs(-42))          # 42

all()

Use case: Check if all items are truthy (e.g., all validations pass).

checks = [x > 0 for x in [1, 2, 3]]
print(all(checks))       # True

any()

Use case: Check if any item is truthy (e.g., any error flags).

flags = [False, 0, "", "hit"]
print(any(flags))        # True

ascii()

Use case: Safe printable representation (escape non-ASCII).

print(ascii("café"))     # 'caf\xe9'

bin()

Use case: Show a number in binary.

print(bin(13))           # '0b1101'

bool()

Use case: Coerce a value to True/False.

print(bool(""))          # False
print(bool("x"))         # True

bytearray() / bytes()

Use case: Mutable / immutable sequences of bytes (I/O, protocols).

b = bytearray(b"ABC"); b[0] = ord('Z'); print(bytes(b))  # b'ZBC'

callable()

Use case: Check if something can be called like a function.

def f(): pass
print(callable(f))       # True

chr() / ord()

Use case: Convert between code point ↔ character.

print(chr(9731))         # '☃'
print(ord('A'))          # 65

complex()

Use case: Work with complex numbers (math, DSP).

z = complex(2, 3)        # 2+3j
print(z.conjugate())     # (2-3j)

dict()

Use case: Create dictionaries / mapping data.

user = dict(name="Tea", role="pentester")
print(user["role"])      # 'pentester'

dir()

Use case: Quick attribute/method discovery (introspection).

print([n for n in dir(str) if "find" in n])  # ['find', 'rfind']

divmod()

Use case: Get quotient and remainder together.

q, r = divmod(17, 5)
print(q, r)              # 3 2

enumerate()

Use case: Loop with indices, cleanly.

for i, item in enumerate(["a", "b", "c"], start=1):
    print(i, item)       # 1 a ...

eval() ⚠️

Use case: Evaluate a Python expression from a trusted string only.

expr = "2 + 3 * 4"
# Emptying __builtins__ only hides names; it is NOT a security sandbox,
# so this is still only safe for strings you trust.
print(eval(expr, {"__builtins__": {}}, {}))  # 14  (builtins hidden, not sandboxed)

filter()

Use case: Keep items that match a condition.

nums = [1, 2, 3, 4]
evens = list(filter(lambda x: x % 2 == 0, nums))
print(evens)             # [2, 4]

float()

Use case: Convert to floating-point number.

print(float("3.14"))     # 3.14

format()

Use case: Format values (numbers, strings).

print(format(1234.567, ",.2f"))  # '1,234.57'

frozenset()

Use case: Immutable set (hashable, usable as dict key).

key = frozenset({"a", "b"})
d = {key: "value"}
print(d[key])            # 'value'

getattr() / setattr() / hasattr() / delattr()

Use case: Access/modify attributes by name.

class Obj:
    """Empty class; attributes are attached to its instances by name below."""


o = Obj()
setattr(o, "x", 10)           # same effect as o.x = 10
print(getattr(o, "x", None))  # 10
print(hasattr(o, "x"))        # True
delattr(o, "x")               # same effect as del o.x

globals() / locals()

Use case: Inspect current variable scopes (debugging, meta-programming).

x = 5
print("x" in globals())  # True in global scope
print("x" in locals())   # True here

hash()

Use case: Get hash of immutable objects (dict keys, sets).

print(hash(("a", 1)))    # stable hash within a run

help()

Use case: Interactive doc helper (in REPL/terminal).

# In Python REPL:
# >>> help(str)

hex()

Use case: Hexadecimal representation.

print(hex(255))          # '0xff'

id()

Use case: Get an object's identity — unique while the object is alive (its memory address in CPython) — to check whether two names refer to the same object.

a = []; b = a
print(id(a) == id(b))    # True

input()

Use case: Read a line from the user (CLI).

name = input("Your name: ")
print("Hi,", name)

int()

Use case: Convert to integer / parse base-N.

print(int("ff", 16))     # 255

isinstance() / issubclass()

Use case: Type checks (safe branching).

print(isinstance("x", str))     # True
print(issubclass(bool, int))    # True

iter() / next()

Use case: Manual iteration / default when exhausted.

it = iter([10, 20])
print(next(it))          # 10
print(next(it))          # 20
# The iterator is now exhausted: the default is returned instead of raising StopIteration
print(next(it, "done"))  # done

len()

Use case: Size/length of containers.

print(len({"a": 1, "b": 2}))  # 2

list() / tuple()

Use case: Build lists/tuples; convert iterables.

print(list("abc"))       # ['a','b','c']
print(tuple([1,2]))      # (1, 2)

map()

Use case: Apply a function to each item.

nums = [1, 2, 3]
squares = list(map(lambda x: x*x, nums))
print(squares)           # [1, 4, 9]

max() / min()

Use case: Extremes, optionally with a key.

words = ["alpha", "beta", "z"]
print(max(words, key=len))  # 'alpha'

memoryview()

Use case: Zero-copy slices of binary data.

# A bytearray is mutable, so a memoryview over it is writable too
b = bytearray(b"ABCDE")
# Slicing a memoryview copies nothing: mv[0] aliases b[1]
mv = memoryview(b)[1:4]
mv[0] = ord(b"Z")
print(b)                 # bytearray(b'AZCDE')

object()

Use case: Smallest base object (often for sentinels).

sentinel = object()
print(sentinel is object())  # False (new object each time)

oct()

Use case: Octal string of a number.

print(oct(64))           # '0o100'

open()

Use case: Read/write files.

with open("notes.txt", "w", encoding="utf-8") as f:
    f.write("hello\n")

pow()

Use case: Exponentiation (with mod for crypto/math).

print(pow(2, 10))        # 1024
print(pow(3, 4, 5))      # (3**4) % 5 == 1

print()

Use case: Quick output / debugging.

print("a", "b", sep=",", end="!\n")  # a,b!

range()

Use case: Sequence of numbers for loops.

for i in range(1, 5):
    print(i)             # 1..4

repr()

Use case: Unambiguous string representation (debug).

print(repr("x\ny"))     # 'x\ny'  (quotes included; the newline is shown as the escape \n)

reversed()

Use case: Iterate backwards.

for c in reversed(["a","b","c"]):
    print(c)             # c b a

round()

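Use case: Round to a given number of digits (exact ties go to the nearest even digit — banker’s rounding; binary floats such as 2.675 are often not exact ties).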

print(round(2.675, 2))   # 2.67 (floating-point nuance)

set()

Use case: Unique items, set ops.

print(set([1,1,2]) | {2,3})  # {1,2,3}

slice()

Use case: Build reusable slice objects.

s = slice(1, None, 2)    # 1::2
print("abcdef"[s])       # 'bdf'

sorted()

Use case: Sort with options (key, reverse).

data = ["aa", "b", "cccc"]
print(sorted(data, key=len, reverse=True))  # ['cccc','aa','b']

str() / int() / float()

Use case: Convert between basic types.

print(str(123), int("7"), float("3.1"))

sum()

Use case: Add numbers (or use start for concatenation).

print(sum([1,2,3]))      # 6
print(sum([[1],[2]], start=[]))  # [1,2]

super()

Use case: Call parent class methods (OOP).

class A:
    """Base class whose f() returns its own tag."""

    def f(self):
        return "A"


class B(A):
    """Subclass that prefixes its tag to whatever the parent's f() returns."""

    def f(self):
        # Zero-argument super() resolves to the next class after B in the MRO (A here)
        parent_result = super().f()
        return "B->" + parent_result


print(B().f())           # 'B->A'

type()

Use case: Get the type of a value.

print(type({}))          # <class 'dict'>

vars()

Use case: Get __dict__ (attributes) of objects.

class C:
    """Holder with two instance attributes, used to show what vars() returns."""

    def __init__(self):
        self.x = 1
        self.y = 2


# vars(obj) returns the instance's __dict__
print(vars(C()))         # {'x': 1, 'y': 2}

zip()

Use case: Pair items from multiple iterables.

names = ["a","b"]; ages = [10, 20]
print(list(zip(names, ages)))  # [('a',10), ('b',20)]

compile()

Use case: Compile source to a code object; run with exec.

code = compile("x = 2*21", "<mem>", "exec")
scope = {}
exec(code, scope)
print(scope["x"])        # 42

breakpoint()

Use case: Drop into debugger at runtime.

x = 10
breakpoint()   # (runs pdb or configured debugger)

classmethod() / staticmethod() / property()

Use case: Build nicer class APIs.

class T:
    """Demo class exposing a read-only property, a static method and a factory."""

    def __init__(self, x):
        self._x = x

    @classmethod
    def make(cls):
        """Alternate constructor: T.make() builds an instance holding 42."""
        return cls(42)

    @staticmethod
    def ping():
        """Needs neither instance nor class state: T.ping()."""
        return "pong"

    @property
    def x(self):
        """Read-only view of the stored value: t.x."""
        return self._x


t = T.make()
print(t.x, T.ping())

re (module) — quick regex helpers (not a built-in function, but essential)

Use case: Match/replace with patterns.

import re
print(bool(re.match(r"^/api/v1/.*", "/api/v1/users")))  # True
print(re.sub(r"\s+", " ", "a   b   c"))               # 'a b c'

Notes

  • ⚠️ eval()/exec() can be dangerous. Never use with untrusted input.
  • For large data or high concurrency, learn asyncio, httpx.AsyncClient, and queues.
  • Use logging for real projects instead of only print().
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment