Last active
May 11, 2021 04:50
-
-
Save Enigma10/6dd5a0e791469e46064f2e6cf227e8f6 to your computer and use it in GitHub Desktop.
Regex cheat sheet in python. Regular expressions allow you to locate and change strings in very powerful ways. They work in almost exactly the same way in every programming language as well.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Regular expressions allow you to locate and change | |
# strings in very powerful ways. | |
# They work in almost exactly the same way in every | |
# programming language as well. | |
# Regular Expressions (Regex) are used to | |
# 1. Search for a specific string in a large amount of data | |
# 2. Verify that a string has the proper format (Email, Phone #) | |
# 3. Find a string and replace it with another string | |
# 4. Format data into the proper form for importing for example | |
# import the Regex module | |
import re | |
# ---------- Was a Match Found ---------- | |
# Search for ape in the string | |
if re.search("ape", "The ape was at the apex"): | |
print("There is an ape") | |
# ---------- Get All Matches ---------- | |
# findall() returns a list of matches | |
# . is used to match any 1 character or space | |
allApes = re.findall("ape.", "The ape was at the apex") | |
for i in allApes: | |
print(i) | |
# finditer returns an iterator of matching objects | |
# You can use span to get the location | |
theStr = "The ape was at the apex" | |
for i in re.finditer("ape.", theStr): | |
# Span returns a tuple | |
locTuple = i.span() | |
print(locTuple) | |
# Slice the match out using the tuple values | |
print(theStr[locTuple[0]:locTuple[1]]) | |
# ---------- Match 1 of Several Letters ---------- | |
# Square brackets will match any one of the characters between | |
# the brackets not including upper and lowercase varieties | |
# unless they are listed | |
animalStr = "Cat rat mat fat pat" | |
allAnimals = re.findall("[crmfp]at", animalStr) | |
for i in allAnimals: | |
print(i) | |
print() | |
# We can also allow for characters in a range | |
# Remember to include upper and lowercase letters | |
someAnimals = re.findall("[c-mC-M]at", animalStr) | |
for i in someAnimals: | |
print(i) | |
print() | |
# Use ^ to denote any character but whatever characters are | |
# between the brackets | |
someAnimals = re.findall("[^Cr]at", animalStr) | |
for i in someAnimals: | |
print(i) | |
print() | |
# ---------- Replace All Matches ---------- | |
# Replace matching items in a string | |
owlFood = "rat cat mat pat" | |
# You can compile a regex into pattern objects which | |
# provide additional methods | |
regex = re.compile("[cr]at") | |
# sub() replaces items that match the regex in the string | |
# with the 1st attribute string passed to sub | |
owlFood = regex.sub("owl", owlFood) | |
print(owlFood) | |
# ---------- Solving Backslash Problems ---------- | |
# Regex use the backslash to designate special characters | |
# and Python does the same inside strings which causes | |
# issues. | |
# Let's try to get "\\stuff" out of a string | |
randStr = "Here is \\stuff" | |
# This won't find it | |
print("Find \\stuff : ", re.search("\\stuff", randStr)) | |
# This does, but we have to put in 4 slashes which is | |
# messy | |
print("Find \\stuff : ", re.search("\\\\stuff", randStr)) | |
# You can get around this by using raw strings which | |
# don't treat backslashes as special | |
print("Find \\stuff : ", re.search(r"\\stuff", randStr)) | |
# ---------- Matching Any Character ---------- | |
# We saw that . matches any character, but what if we | |
# want to match a period. Backslash the period | |
# You do the same with [, ] and others | |
randStr = "F.B.I. I.R.S. CIA" | |
print("Matches :", len(re.findall(".\..\..", randStr))) | |
# ---------- Matching Whitespace ---------- | |
# We can match many whitespace characters | |
randStr = """This is a long | |
string that goes | |
on for many lines""" | |
print(randStr) | |
# Remove newlines | |
regex = re.compile("\n") | |
randStr = regex.sub(" ", randStr) | |
print(randStr) | |
# You can also match | |
# \b : Backspace | |
# \f : Form Feed | |
# \r : Carriage Return | |
# \t : Tab | |
# \v : Vertical Tab | |
# You may need to remove \r\n on Windows | |
# ---------- Matching Any Single Number ---------- | |
# \d can be used instead of [0-9] | |
# \D is the same as [^0-9] | |
randStr = "12345" | |
print("Matches :", len(re.findall("\d", randStr))) | |
# ---------- Matching Multiple Numbers ---------- | |
# You can match multiple digits by following the \d with {numOfValues} | |
# Match 5 numbers only | |
if re.search("\d{5}", "12345"): | |
print("It is a zip code") | |
# You can also match within a range | |
# Match values that are between 5 and 7 digits | |
numStr = "123 12345 123456 1234567" | |
print("Matches :", len(re.findall("\d{5,7}", numStr))) | |
# ---------- Matching Any Single Letter or Number ---------- | |
# \w is the same as [a-zA-Z0-9_] | |
# \W is the same as [^a-zA-Z0-9_] | |
phNum = "412-555-1212" | |
# Check if it is a phone number | |
if re.search("\w{3}-\w{3}-\w{4}", phNum): | |
print("It is a phone number") | |
# Check for valid first name between 2 and 20 characters | |
if re.search("\w{2,20}", "Ultraman"): | |
print("It is a valid name") | |
# ---------- Matching WhiteSpace ---------- | |
# \s is the same as [\f\n\r\t\v] | |
# \S is the same as [^\f\n\r\t\v] | |
# Check for valid first and last name with a space | |
if re.search("\w{2,20}\s\w{2,20}", "Toshio Muramatsu"): | |
print("It is a valid full name") | |
# ---------- Matching One or More ---------- | |
# + matches 1 or more characters | |
# Match a followed by 1 or more characters | |
print("Matches :", len(re.findall("a+", "a as ape bug"))) | |
# ---------- Problem ---------- | |
# Create a Regex that matches email addresses from a list | |
# 1. 1 to 20 lowercase and uppercase letters, numbers, plus ._%+- | |
# 2. An @ symbol | |
# 3. 2 to 20 lowercase and uppercase letters, numbers, plus .- | |
# 4. A period | |
# 5. 2 to 3 lowercase and uppercase letters | |
emailList = "[email protected] [email protected] @apple.com [email protected]" | |
print("Email Matches :", len(re.findall("[\w._%+-]{1,20}@[\w.-]{2,20}.[A-Za-z]{2,3}", | |
emailList))) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
# Did you find a match | |
# if re.search("REGEX", yourString) | |
# Get list of matches | |
# print("Matches :", len(re.findall("REGEX", yourString))) | |
# Get a pattern object | |
# regex = re.compile("REGEX") | |
# Substitute the match | |
# yourString = regex.sub("substitution", yourString) | |
# [ ] : Match what is in the brackets | |
# [^ ] : Match anything not in the brackets | |
# . : Match any 1 character or space | |
# + : Match 1 or more of what proceeds | |
# \n : Newline | |
# \d : Any 1 number | |
# \D : Anything but a number | |
# \w : Same as [a-zA-Z0-9_] | |
# \W : Same as [^a-zA-Z0-9_] | |
# \s : Same as [\f\n\r\t\v] | |
# \S : Same as [^\f\n\r\t\v] | |
# {5} : Match 5 of what proceeds the curly brackets | |
# {5,7} : Match values that are between 5 and 7 in length | |
# ---------- Matching Zero or One ---------- | |
randStr = "cat cats" | |
regex = re.compile("[cat]+s?") | |
matches = re.findall(regex, randStr) | |
# Match cat or cats | |
print("Matches :", len(matches)) | |
for i in matches: | |
print(i) | |
# ---------- Matching Zero or More ---------- | |
# * matches zero or more of what proceeds it | |
randStr = "doctor doctors doctor's" | |
# Match doctor doctors or doctor's | |
regex = re.compile("[doctor]+['s]*") | |
matches = re.findall(regex, randStr) | |
print("Matches :", len(matches)) | |
# You can do the same by setting an interval match | |
regex = re.compile("[doctor]+['s]{0,2}") | |
matches = re.findall(regex, randStr) | |
print("Matches :", len(matches)) | |
for i in matches: | |
print(i) | |
# ---------- PROBLEM ---------- | |
# On Windows newlines are some times \n and other times \r\n | |
# Create a regex that will grab each of the lines in this | |
# string, print out the number of matches and each line | |
longStr = '''Just some words | |
and some more\r | |
and more | |
''' | |
print("Matches :", len(re.findall(r"[\w\s]+[\r]?\n", longStr))) | |
matches = re.findall("[\w\s]+[\r]?\n", longStr) | |
for i in matches: | |
print(i) | |
# ---------- Greedy & Lazy Matching ---------- | |
randStr = "<name>Life On Mars</name><name>Freaks and Geeks</name>" | |
# Let's try to grab everything between <name> tags | |
# Because * is greedy (It grabs the biggest match possible) | |
# we can't get what we want, which is each individual tag | |
# match | |
regex = re.compile(r"<name>.*</name>") | |
matches = re.findall(regex, randStr) | |
print("Matches :", len(matches)) | |
for i in matches: | |
print(i) | |
# We want to grab the smallest match we use *?, +?, or | |
# {n,}? instead | |
regex = re.compile(r"<name>.*?</name>") | |
matches = re.findall(regex, randStr) | |
print("Matches :", len(matches)) | |
for i in matches: | |
print(i) | |
# ---------- Word Boundaries ---------- | |
# We use word boundaries to define where our matches start | |
# and end | |
# \b matches the start or end of a word | |
# If we want ape it will match ape and the beginning of apex | |
randStr = "ape at the apex" | |
regex = re.compile(r"ape") | |
# If we use the word boundary | |
regex = re.compile(r"\bape\b") | |
matches = re.findall(regex, randStr) | |
print("Matches :", len(matches)) | |
for i in matches: | |
print(i) | |
# ---------- String Boundaries ---------- | |
# ^ : Matches the beginning of a string if outside of | |
# a [ ] | |
# $ : Matches the end of a string | |
# Grab everything from the start of the string to @ | |
randStr = "Match everything up to @" | |
regex = re.compile(r"^.*[^@]") | |
matches = re.findall(regex, randStr) | |
print("Matches :", len(matches)) | |
for i in matches: | |
print(i) | |
# Grab everything from @ to the end of the line | |
randStr = "@ Get this string" | |
regex = re.compile(r"[^@\s].*$") | |
matches = re.findall(regex, randStr) | |
print("Matches :", len(matches)) | |
for i in matches: | |
print(i) | |
# Grab the 1st word of each line using the the multiline | |
# code which allows for the targeting of each line after | |
# a line break with ^ | |
randStr = '''Ape is big | |
Turtle is slow | |
Cheetah is fast''' | |
regex = re.compile(r"(?m)^.*?\s") | |
matches = re.findall(regex, randStr) | |
print("Matches :", len(matches)) | |
for i in matches: | |
print(i) | |
# ---------- Subexpressions ---------- | |
# Subexpressions are parts of a larger expression | |
# If you want to match for a large block, but | |
# only want to return part of it. To do that | |
# surround what you want with ( ) | |
# Get just the number minus the area code | |
randStr = "My number is 412-555-1212" | |
regex = re.compile(r"412-(.*)") | |
matches = re.findall(regex, randStr) | |
print("Matches :", len(matches)) | |
for i in matches: | |
print(i) | |
# ---------- Problem ---------- | |
# Get just the numbers minus the area codes from | |
# this string | |
randStr = "412-555-1212 412-555-1213 412-555-1214" | |
regex = re.compile(r"412-(.{8})") | |
matches = re.findall(regex, randStr) | |
print("Matches :", len(matches)) | |
for i in matches: | |
print(i) | |
# ---------- Multiple Subexpressions ---------- | |
# You can have multiple subexpressions as well | |
# Get both numbers that follow 412 separately | |
randStr = "My number is 412-555-1212" | |
regex = re.compile(r"412-(.*)-(.*)") | |
matches = re.findall(regex, randStr) | |
print("Matches :", len(matches)) | |
print(matches[0][0]) | |
print(matches[0][1]) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
# Did you find a match | |
# if re.search("REGEX", yourString) | |
# Get list of matches | |
# print("Matches :", len(re.findall("REGEX", yourString))) | |
# Get a pattern object | |
# regex = re.compile("REGEX") | |
# Substitute the match | |
# yourString = regex.sub("substitution", yourString) | |
# [ ] : Match what is in the brackets | |
# [^ ] : Match anything not in the brackets | |
# ( ) : Return surrounded submatch | |
# . : Match any 1 character or space | |
# + : Match 1 or more of what proceeds | |
# ? : Match 0 or 1 | |
# * : Match 0 or More | |
# *? : Lazy match the smallest match | |
# \b : Word boundary | |
# ^ : Beginning of String | |
# $ : End of String | |
# \n : Newline | |
# \d : Any 1 number | |
# \D : Anything but a number | |
# \w : Same as [a-zA-Z0-9_] | |
# \W : Same as [^a-zA-Z0-9_] | |
# \s : Same as [\f\n\r\t\v] | |
# \S : Same as [^\f\n\r\t\v] | |
# {5} : Match 5 of what proceeds the curly brackets | |
# {5,7} : Match values that are between 5 and 7 in length | |
# ($m) : Allow ^ on multiline string | |
# ---------- Back References ---------- | |
# A back reference allows you to to reuse the expression | |
# that proceeds it | |
# Grab a double word | |
randStr = "The cat cat fell out the window" | |
# Match a word boundary, 1 or more characters followed | |
# by a space if it is then followed by the same | |
# match that is surrounded by the parentheses | |
regex = re.compile(r"(\b\w+)\s+\1") | |
matches = re.findall(regex, randStr) | |
print("Matches :", len(matches)) | |
for i in matches: | |
print(i) | |
# ---------- Back Reference Substitutions ---------- | |
# Replace the bold tags in the link with no tags | |
randStr = "<a href='#'><b>The Link</b></a>" | |
# Regex matches bold tags and grabs the text between | |
# them to be used by the back reference | |
regex = re.compile(r"<b>(.*?)</b>") | |
# Replace the tags with just the text between them | |
randStr = re.sub(regex, r"\1", randStr) | |
print(randStr) | |
# ---------- Another Back Reference Substitution ---------- | |
# Receive this string | |
randStr = "412-555-1212" | |
# Match the phone number using multiple subexpressions | |
regex = re.compile(r"([\d]{3})-([\d]{3}-[\d]{4})") | |
# Output (412)555-1212 | |
randStr = re.sub(regex, r"(\1)\2", randStr) | |
print(randStr) | |
# ---------- PROBLEM ---------- | |
# Receive a string like this | |
randStr = "https://www.youtube.com http://www.google.com" | |
# Grab the URL and then provide the following output | |
# using a back reference substitution | |
# <a href='https://www.youtube.com'>www.youtube.com</a> | |
# <a href='https://www.google.com'>www.google.com</a> | |
regex = re.compile(r"(https?://([\w.]+))") | |
randStr = re.sub(regex, r"<a href='\1'>\2</a>\n", randStr) | |
print(randStr) | |
# ---------- Look Ahead ---------- | |
# A look ahead defines a pattern to match but not return | |
# You define the expression to look for but not return | |
# like this (?=expression) | |
randStr = "One two three four" | |
# Grab all letters and numbers of 1 or more separated | |
# by a word boundary but don't include it | |
regex = re.compile(r"\w+(?=\b)") | |
matches = re.findall(regex, randStr) | |
for i in matches: | |
print(i) | |
# ---------- Look Behind ---------- | |
# The look behind looks for what is before the text | |
# to return, but doesn't return it | |
# It is defined like (?<=expression) | |
randStr = "1. Bread 2. Apples 3. Lettuce" | |
# Find the number, period and space, but only return | |
# the 1 or more letters or numbers that follow | |
regex = re.compile(r"(?<=\d.\s)\w+") | |
matches = re.findall(regex, randStr) | |
for i in matches: | |
print(i) | |
# ---------- Look Ahead & Behind ---------- | |
randStr = "<h1>I'm Important</h1> <h1>So am I</h1>" | |
# Use the look behind, get 1 or more of anything, | |
# and use the look ahead | |
regex = re.compile(r"(?<=<h1>).+?(?=</h1>)") | |
matches = re.findall(regex, randStr) | |
for i in matches: | |
print(i) | |
import re | |
# ---------- Negative Look Ahead & Behind ---------- | |
# These are used to look for text that doesn't match | |
# the pattern | |
# (?!expression) : Negative Look Ahead | |
# (?<!expression) : Negative Look Behind | |
randStr = "8 Apples $3, 1 Bread $1, 1 Cereal $4" | |
# Grab the total number of grocery items by ignoring the $ | |
regex = re.compile(r"(?<!\$)\d+") | |
matches = re.findall(regex, randStr) | |
print(len(matches)) | |
# Convert from a string list to an int list | |
matches = [int(i) for i in matches] | |
from functools import reduce | |
# Sum the items in the list with reduce | |
print("Total Items {}".format(reduce((lambda x, y: x + y), matches))) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
# [ ] : Match what is in the brackets | |
# [^ ] : Match anything not in the brackets | |
# ( ) : Return surrounded submatch | |
# . : Match any 1 character or space | |
# + : Match 1 or more of what proceeds | |
# ? : Match 0 or 1 | |
# * : Match 0 or More | |
# *? : Lazy match the smallest match | |
# \b : Word boundary | |
# ^ : Beginning of String | |
# $ : End of String | |
# \n : Newline | |
# \d : Any 1 number | |
# \D : Anything but a number | |
# \w : Same as [a-zA-Z0-9_] | |
# \W : Same as [^a-zA-Z0-9_] | |
# \s : Same as [\f\n\r\t\v] | |
# \S : Same as [^\f\n\r\t\v] | |
# {5} : Match 5 of what proceeds the curly brackets | |
# {5,7} : Match values that are between 5 and 7 in length | |
# ($m) : Allow ^ on multiline string | |
# Use a back reference to substitute what is between the | |
# bold tags and eliminate the bold tags | |
# re.sub(r"<b>(.*?)</b>", r"\1", randStr) | |
# Use a look ahead to find all characters of 1 or more | |
# with a word boundary, but don't return the word | |
# boundary | |
# re.findall(r"\w+(?=\b)", randStr) | |
# Use a look behind to find words starting with a number, | |
# period and space, but only return the word that follows | |
# re.findall(r"(?<=\d.\s)\w+", randStr) | |
# Use a negative look behind to only return numbers without | |
# a $ in front of them | |
# re.findall(r"(?<!\$)\d+", randStr) | |
# ---------- OR CONDITIONAL ---------- | |
# You can use | to define the matches you'll except | |
randStr = "1. Dog 2. Cat 3. Turtle" | |
regex = re.compile(r"\d\.\s(Dog|Cat)") | |
matches = re.findall(regex, randStr) | |
print(len(matches)) | |
for i in matches: | |
print(i) | |
# ---------- PROBLEM ---------- | |
# Create a regex that will match for 5 digit zip | |
# codes or zip codes with 5 digits a dash and | |
# then 4 digits | |
randStr = "12345 12345-1234 1234 12346-333" | |
regex = re.compile(r"(\d{5}-\d{4}|\d{5}\s)") | |
matches = re.findall(regex, randStr) | |
print(len(matches)) | |
for i in matches: | |
print(i) | |
# ---------- GROUP ---------- | |
# We can use group to retrieve parts of regex | |
# matches | |
''' | |
bd = input("Enter your birthday (mm-dd-yyyy) : ") | |
bdRegex = re.search(r"(\d{1,2})-(\d{1,2})-(\d{4})", bd) | |
print("You were born on", bdRegex.group()) | |
print("Birth Month", bdRegex.group(1)) | |
print("Birth Day", bdRegex.group(2)) | |
print("Birth Year", bdRegex.group(3)) | |
''' | |
# ---------- MATCH OBJECT FUNCTIONS ---------- | |
# There are functions that provide more information | |
# on your matches | |
match = re.search(r"\d{2}", "The chicken weighed 13 lbs") | |
# Print the match | |
print("Match :", match.group()) | |
# Print the start and ending index of the match | |
print("Span :", match.span()) | |
# Print starting index of the match | |
print("Match :", match.start()) | |
# Print the ending index of the match | |
print("Match :", match.end()) | |
# ---------- NAMED GROUPS ---------- | |
# You can also assign names to matches | |
randStr = "December 21 1974" | |
regex = r"^(?P<month>\w+)\s(?P<day>\d+)\s(?P<year>\d+)" | |
matches = re.search(regex, randStr) | |
print("Month :", matches.group('month')) | |
print("Day :", matches.group('day')) | |
print("Year :", matches.group('year')) | |
# ---------- PROBLEM ---------- | |
# Find all of the following email addresses | |
randStr = "[email protected] [email protected] [email protected]" | |
regex = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+") | |
matches = re.findall(regex, randStr) | |
print(len(matches)) | |
for i in matches: | |
print(i) | |
# ---------- PROBLEM ---------- | |
# Find all of the following phone numbers and then print them | |
randStr = "14125551212 4125551212 (412)5551212 412 555 1212 412-555-1212 1-412-555-1212" | |
regex = re.compile(r"((1?)(-| ?)(\()?(\d{3})(\)|-| |\)-|\) )?(\d{3})(-| )?(\d{4}|\d{4}))") | |
matches = re.findall(regex, randStr) | |
print(len(matches)) | |
for i in matches: | |
print(i[0].lstrip()) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment