-
-
Save tibbiyelininja/a18d7295e4bc533fd0d6 to your computer and use it in GitHub Desktop.
# BangPypers talk - Using Regular Expressions by Arvind Padmanabhan
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

# re.match() only matches at the beginning of the string, whereas
# re.search() scans the whole string.  Each call prints a match object,
# or None when nothing matches.
print(re.match(r'bc', 'abc'))        # None: the string does not start with 'bc'
print(re.match(r'abc', 'abc'))
print(re.search(r'bc', 'abc'))
print(re.search(r'^bc', 'abc'))      # None: '^' anchors at the start
print(re.search(r'^ab$', 'abc'))     # None: '$' anchors at the end
print(re.search(r'^abc$', 'abc'))

# Quantifiers: '?' (zero or one), '*' (zero or more), '+' (one or more).
print(re.search(r'a?bc', 'bc'))
print(re.search(r'a?bc', 'abc'))
print(re.search(r'a?bc', 'aabc'))    # matches 'abc', skipping the first 'a'
print(re.search(r'a*bc', 'aabc'))
print(re.search(r'a+bc', 'aabc'))
print(re.search(r'a+bc', 'bc'))      # None: at least one 'a' is required
print(re.search(r'a*bc', 'bc'))

# Matching is case-sensitive unless re.I is passed.
print(re.search(r'a*bc', 'BC'))      # None
print(re.search(r'a*bc', 'BC', re.I))

# Greedy ('.*') versus non-greedy ('.*?') repetition.
print(re.search(r'a.*bc', 'aabbcabcd'))    # 'aabbcabc'
print(re.search(r'a.*?bc', 'aabbcabcd'))   # 'aabbc'
print(re.search(r'a.*?d', 'ababdbabd'))    # 'ababd'

# A negated character class is an alternative to non-greedy matching.
print(re.search(r'a[^d]*d', 'ababdbabd'))
print(re.findall(r'a[^d]*d', 'ababdbabd'))

print(re.search(r'\d+', 'The year is 2015.'))

# Lookahead: first names that are followed by the surname 'Khan'.
print(re.findall(r'\w+(?=\s+Khan)', 'Salman Khan, Shahrukh Khan, Aamir Khan and Fardeen Khan are famous actors.'))
#====================================================================== | |
# Substitution works on str as well as on bytes, provided the pattern,
# the replacement and the subject are all of the same type.
s = 'Я люблю мороженое'
print(s.encode())
print(re.sub('л', 'ѫ', s))
# Same substitution on the encoded bytes of the two letters.
print(re.sub(b'\xd0\xbb', b'\xd1\xab', s.encode()).decode())
# Mixing a bytes pattern with a str replacement does not work:
#print(re.sub(b'\xd0\xbb', 'ѫ', s.encode()).decode()) # error
#====================================================================== | |
# Extract year, month and date
# One- or two-digit months and days are both accepted.
date_in_path = r'/(\d{4})/(\d{1,2})/(\d{1,2})/'
url = 'http://techcrunch.com/2015/08/15/the-future-of-marketplace-lending-lessons-from-an-advertising-past/'
print(re.findall(date_in_path, url))
url = 'http://techcrunch.com/2015/8/15/the-future-of-marketplace-lending-lessons-from-an-advertising-past/'
print(re.findall(date_in_path, url))

# Extract blog category or tag
# Group 1 is the kind ('category' or 'tag'), group 2 the rest of the URL.
blog_section = r'/blog/(category|tag)/(.*)'
url = 'http://iedf.in/index.php/blog/category/electrical-electronics'
print(re.findall(blog_section, url))
url = 'http://iedf.in/index.php/blog/tag/innovation'
print(re.findall(blog_section, url))

# Redirect articles of 2013 and older to archives with renamed file
urls = ['http://www.example.io/articles/2013/jan/how-to-format-hdd.htm',
        'http://www.example.io/articles/2001/dec/how-to-format-hdd.htm',
        'http://www.example.io/articles/1994/apr/how-to-format-hdd.htm',
        'http://www.example.io/articles/2014/oct/how-to-format-hdd.htm']
# The year alternatives cover 1000-1999, 2000-2009 and 2010-2013; URLs of
# later years do not match and are left untouched.  The replacement mixes
# a named (\g<year>) and a numbered (\2) group reference.
archived_urls = [re.sub(r'/articles/(?P<year>1\d{3}|200\d|201[0-3])/(\w+)/',
                        r'/archives/\g<year>-\2-', u)
                 for u in urls]
print(archived_urls)

# Redirect all references from some folders to a sub-domain
urls = ['http://www.example.io/images/12343341.jpg',
        'http://www.example.io/images/thumbs/798788324.png',
        'https://www.example.io/images/events/photos/hackathon-2015/345145.png',
        'http://www.example.io/icons/animated/aasd.gif']
# Only the path below images/ or icons/ is captured; (?:...) groups the
# alternatives without creating extra capture groups.
cdn_urls = [re.sub(r'^https?://www\.example\.io/(?:images|icons)/(.*\.(?:jpg|png|gif))',
                   r'https://cdn.example.io/imgs/\1', u)
            for u in urls]
print(cdn_urls)
#====================================================================== | |
import re

s = """
The original document was signed on 1998-1-31.
An ammendment was approved on 2015-02-24. It is expected to be signed on
2015-10-1
"""

# Year, month and day as groups 1, 2 and 3.
iso_date = r'(\d{4})-(\d{1,2})-(\d{1,2})'

# Convert from YYYY-MM-DD to DD-MM-YYYY date format
# Both calls give the same result: a raw string saves doubling the
# backslashes of the group references in the replacement.
print(re.sub(iso_date, '\\3-\\2-\\1', s))
converted = re.sub(iso_date, r'\3-\2-\1', s)
print(converted)

# Extract individual fields of first match
m = re.search(iso_date, s)
print(m)
print(m.groups())
year, month, day = m.groups()
print(year, month, day)

# Extract individual fields of first match by named groups
m = re.search(r'(?P<year>\d{4})-(?P<month>\d{1,2})-(?P<day>\d{1,2})', s)
print(m.group('day'), m.group('month'), m.group('year'))
#====================================================================== | |
import re

runs = """Dhawan 23 101 44 11 76
Kohli 111,23,12, 58, 90
Saha 45: 8 : 37 : 65 : 121"""

# Fields are separated either by a comma/colon with optional surrounding
# whitespace, or by whitespace alone.  The separator must consume at
# least one character: a pattern that can match the empty string makes
# re.split() (Python 3.7+) cut between every pair of characters.
field_sep = r'\s*[,:]\s*|\s+'
parsed_runs = [re.split(field_sep, line) for line in re.split(r'\n+', runs)]
print(parsed_runs)
#====================================================================== | |
import re

s = "My name is Raju. I was born in Delhi. I came to Mumbai in 1994. Though I've lived here ever since, I miss Delhi."

# A run of two or more spaces that follows a full stop and precedes a
# capital letter.  The lookbehind and lookahead keep the full stop and
# the capital letter out of the match, so only the spaces are replaced.
# NOTE(review): the sample text as it stands contains only single
# spaces, so nothing matches here; presumably the original sample had
# wider gaps between sentences -- confirm.
gap_patt = re.compile(r'(?<=\.) {2,}(?=[A-Z])')
print(gap_patt.findall(s))
print(gap_patt.sub(' ', s))
# subn() additionally reports the number of substitutions made.
print(gap_patt.subn(' ', s))
#====================================================================== | |
import re

L = ['Hindi', 'English', 'Kannada', 'Urdu', 'Punjabi', 'Tamil', 'Assamese']
# Map each name to (length, number of vowels); re.I makes the vowel
# class match upper-case letters too.
D = {x: (len(x), len(re.findall('[aeiou]', x, re.I))) for x in L}
print(D)
#====================================================================== | |
import re

s = "'Well, I've tried to say \"How Doth the Little Busy Bee,\" but it all came different!' Alice replied in a very melancholy voice."

# Find opening quotes and the text they enclose.
quote_patt = re.compile(
    # Group 1: a single or double quote character.
    r'([\'"])'
    # Skip the apostrophe of a contraction or possessive (I've, I'm,
    # don't, ...).  The \b restricts this to complete suffixes, so a
    # quotation that merely starts with one of these letters (such as
    # "the end") is still recognised.
    r'(?!(?:ve|m|re|s|t|d|ll)\b)'
    # Group 2: the quoted text, captured inside a lookahead so that nested
    # quotations can still be found.  Inside a character class \1 is the
    # octal escape for chr(1), not a backreference, so [^\1]* matches
    # practically anything and the capture runs greedily up to the LAST
    # occurrence of the same quote character.
    r'(?=([^\1]*)\1)'
)
for i, m in enumerate(quote_patt.finditer(s), 1):
    print("Group {:d}: ".format(i))
    for g in m.groups():
        print(' '+g)
#====================================================================== | |
import re

ip_str = """
Remote address 11.242.97.38
111.200.251.63 was deleted
222.97.98.180 access denied
Unknown address:71.7.287.38
Something123.63.97.29
26.98.73.262
99.125.34.153
121.226.291.143
122.215.259.80
88.32.172.106
"""

# Match IP address format without range checking
patt = re.compile(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b')
print(patt.findall(ip_str))

# A single field in the range 0-255: 250-255, 200-249, or one to three
# digits with an optional leading 0 or 1.
octet = r'25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?'

# Match valid IP addresses
# Non-capturing groups only, so findall() returns the whole addresses.
patt = re.compile(r'\b(?:(?:' + octet + r')\.){3}(?:' + octet + r')\b')
print(patt.findall(ip_str))

# Obtain individual fields from valid IP addresses
# Four capturing groups, so findall() returns one 4-tuple per address.
patt = re.compile(r'\b' + r'\.'.join(['(' + octet + ')'] * 4) + r'\b')
print(patt.findall(ip_str))
#====================================================================== | |
import re


def get_inr(number):
    """Return *number* as text with Indian-style digit grouping.

    The last three digits of the integer part form one group and the
    digits before them are grouped in pairs, e.g. 1234567 becomes
    '12,34,567'.  A leading sign is kept.  Everything from the decimal
    point onwards is left untouched, so long fractions do not receive
    commas.

    number -- an int or float (anything accepted by str()).
    """
    whole, point, fraction = str(number).partition('.')
    # A comma goes after every digit that is followed, up to the end of
    # the digit run, by either exactly three digits or by one or more
    # pairs of digits plus three digits.
    grouped = re.sub(r'\d(?=(?:\d{2})+\d{3}(?!\d)|\d{3}(?!\d))',
                     r'\g<0>,', whole)
    return grouped + point + fraction
# Positions in a run of digits after which a comma is due, written in
# verbose (re.X) form.  Compiled once instead of on every call.
_INR_COMMA_SPOTS = re.compile(r"""
    \d(?=                          # a digit followed, up to the end of the run, by
        (?:\d{2})+\d{3}(?!\d)      #   pairs of digits and the last three (>= 1,00,000)
      | \d{3}(?!\d)                #   or just the last three digits (>= 1,000)
    )""", re.X)


def get_inr_x(number):
    """Return *number* as text with Indian-style digit grouping.

    Verbose-pattern variant: the last three digits of the integer part
    form one group and the digits before them are grouped in pairs, e.g.
    1234567 becomes '12,34,567'.  A leading sign is kept.  Everything
    from the decimal point onwards is left untouched, so long fractions
    do not receive commas.

    number -- an int or float (anything accepted by str()).
    """
    whole, point, fraction = str(number).partition('.')
    return _INR_COMMA_SPOTS.sub(r'\g<0>,', whole) + point + fraction
# Format the powers of ten from 1 to 1000000000: positive and negative,
# as integers and as floats with a fractional part.
print(*map(get_inr, [10**x for x in range(10)]))
print(*map(get_inr, [-10**x for x in range(10)]))
print(*map(get_inr, [10**x+0.45 for x in range(10)]))
print(*map(get_inr, [-10**x-0.45 for x in range(10)]))
#====================================================================== | |
import re

content = """
When Raju is released from prison after serving two years
for forgery and for embezzlement, Raju goes to the temple
located on the Sarayu River in his hometown of Malgudi,
which is far from prison. He thinks prison is not too bad
a place, and he is wondering what to do next with his life.
Then a villager named Velan shows up and, taking Raju for
a holy wise man or guru, consults with him about his sister,
who refuses to marry as the family wishes. Well aware that
he is not a guru, Raju is evasive, but Velan brings his
sister anyway, and after their meeting she conforms to her
family’s wishes. So begins Raju’s life as a holy man.
"""

# Words that occur again before the next full stop.  [^\.]+ cannot cross
# a full stop, and because the repetition is tested inside a lookahead,
# only the word itself is consumed and the search resumes right after it.
repeated = re.findall(r'\b(\w+)\b(?=[^\.]+\b\1\b)', content, re.I)
print(repeated)  # correct
# Same search, additionally capturing the text between the two occurrences.
print(re.findall(r'\b(\w+)\b(?=([^\.]+)\b\1\b)', content, re.I))  # for debugging
# Without the lookahead each match consumes everything up to the repeated word.
print(re.findall(r'\b(\w+)\b[^\.]+\b\1\b', content, re.I))  # nested/overlapping occurrences are not matched
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment