Skip to content

Instantly share code, notes, and snippets.

@dumpmycode
Last active March 29, 2016 08:06
Show Gist options
  • Save dumpmycode/b3ada7e9e80ebe750385 to your computer and use it in GitHub Desktop.
Save dumpmycode/b3ada7e9e80ebe750385 to your computer and use it in GitHub Desktop.
Google Python Class - babynames.py
#!/usr/bin/python
# Copyright 2010 Google Inc.
# Licensed under the Apache License, Version 2.0
# http://www.apache.org/licenses/LICENSE-2.0
# Google's Python Class
# http://code.google.com/edu/languages/google-python-class/
import sys
import re
"""Baby Names exercise
Define the extract_names() function below and change main()
to call it.
For writing regex, it's nice to include a copy of the target
text for inspiration.
Here's what the html looks like in the baby.html files:
...
<h3 align="center">Popularity in 1990</h3>
....
<tr align="right"><td>1</td><td>Michael</td><td>Jessica</td>
<tr align="right"><td>2</td><td>Christopher</td><td>Ashley</td>
<tr align="right"><td>3</td><td>Matthew</td><td>Brittany</td>
...
Suggested milestones for incremental development:
Extract all the text from the file and print it
Find and extract the year and print it
Extract the names and rank numbers and print them
Get the names data into a dict and print it
Build the [year, 'name rank', ... ] list and print it
Fix main() to use the extract_names() list
def test():
# this function uses 10k+ function calls. not a good way to iterate things.
name_list = []
with open(f) as fo:
data = fo.readlines()
for line in data:
nrmatch = re.search(r'<td>(\w+)</td><td>(\w+)</td><td>(\w+)</td>', line)
if nrmatch:
name_list.append('{} {}'.format(nrmatch.group(2), nrmatch.group(1)))
name_list.append('{} {}'.format(nrmatch.group(3), nrmatch.group(1)))
name_list = sorted(name_list)
name_list.insert(0, ''.join(re.findall(r'<h3 align="center">Popularity in (\w+)</h3>', ''.join(data))))
print name_list[:10]
def test():
# this function uses 4k function calls.
# fewer function calls, fewer resources wasted.
name_list = []
with open(f) as fo:
data = fo.read()
match = re.findall(r'<td>(\w+)</td><td>(\w+)</td><td>(\w+)</td>', data)
for item in match:
name_list.append('{} {}'.format(match[1], match[0]))
name_list.append('{} {}'.format(match[3], match[0]))
name_list.sort()
name_list.insert(0, ''.join(re.findall(r'<h3 align="center">Popularity in (\w+)</h3>', data)))
"""
def extract_names(filename):
    """Extract the year and the ranked baby names from a baby*.html file.

    The file is read in full and scanned with two regular expressions:
    one for the "Popularity in <year>" heading and one for table rows of
    the form <td>rank</td><td>male</td><td>female</td>.

    Args:
        filename: path of the html file to parse.

    Returns:
        A list whose first element is the year string, followed by
        'name rank' strings sorted alphabetically. Each name appears once;
        when a name occurs in several rows, the rank of the first row in
        which it was seen is kept.

    Raises:
        ValueError: if no "Popularity in <year>" heading is found.
        IOError/OSError: if the file cannot be opened.
    """
    with open(filename) as fobj:
        data = fobj.read()

    year_match = re.search(r'Popularity in (\w+)', data)
    if year_match is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(
            'no "Popularity in <year>" heading found in {}'.format(filename))
    year = year_match.group(1)

    # findall returns one (rank, male, female) tuple per table row.
    rows = re.findall(r'<td>(\w+)</td><td>(\w+)</td><td>(\w+)</td>', data)

    # Map each name to the rank of its first occurrence. The rows are
    # expected to be listed by ascending rank (as in the sample html), so the
    # first sighting is the best rank. Keying on the bare name is what
    # makes de-duplication work; comparing a name against the formatted
    # 'name rank' strings would never match.
    rank_by_name = {}
    for rank, male, female in rows:
        for name in (male, female):
            if name not in rank_by_name:
                rank_by_name[name] = rank

    rankfile = sorted(
        '{} {}'.format(name, rank) for name, rank in rank_by_name.items())
    rankfile.insert(0, year)
    return rankfile
def main():
    """Command-line entry point.

    Usage: [--summaryfile] file [file ...]

    For each file given on the command line, extract the year and the
    ranked names with extract_names(). With --summaryfile the text is
    written to '<file>.summary'; otherwise it is written to standard
    output. Exits with status 1 after printing the usage line when no
    arguments are given.
    """
    # Make a list of command line arguments, omitting the [0] element
    # which is the script itself.
    args = sys.argv[1:]
    if not args:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('usage: [--summaryfile] file [file ...]')
        sys.exit(1)

    # Notice the summary flag and remove it from args if it is present.
    summary = False
    if args[0] == '--summaryfile':
        summary = True
        del args[0]

    # For each filename, get the names, then either print the text output
    # or write it to a summary file.
    for filename in args:
        text = '\n'.join(extract_names(filename)) + '\n'
        if summary:
            with open(filename + '.summary', 'w') as fobj:
                fobj.write(text)
        else:
            # text already ends with a newline, so write it verbatim
            # rather than letting print append a second one.
            sys.stdout.write(text)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment