Last active
March 29, 2016 08:06
-
-
Save dumpmycode/b3ada7e9e80ebe750385 to your computer and use it in GitHub Desktop.
Google Python Class - babynames.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# Copyright 2010 Google Inc. | |
# Licensed under the Apache License, Version 2.0 | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# Google's Python Class | |
# http://code.google.com/edu/languages/google-python-class/ | |
import sys | |
import re | |
"""Baby Names exercise | |
Define the extract_names() function below and change main() | |
to call it. | |
For writing regex, it's nice to include a copy of the target | |
text for inspiration. | |
Here's what the html looks like in the baby.html files: | |
... | |
<h3 align="center">Popularity in 1990</h3> | |
.... | |
<tr align="right"><td>1</td><td>Michael</td><td>Jessica</td> | |
<tr align="right"><td>2</td><td>Christopher</td><td>Ashley</td> | |
<tr align="right"><td>3</td><td>Matthew</td><td>Brittany</td> | |
... | |
Suggested milestones for incremental development: | |
Extract all the text from the file and print it | |
Find and extract the year and print it | |
Extract the names and rank numbers and print them | |
Get the names data into a dict and print it | |
Build the [year, 'name rank', ... ] list and print it | |
Fix main() to use the extract_names() list
def test(): | |
# this function uses 10k+ function calls. not a good way to iterate things. | |
name_list = [] | |
with open(f) as fo: | |
data = fo.readlines() | |
for line in data: | |
nrmatch = re.search(r'<td>(\w+)</td><td>(\w+)</td><td>(\w+)</td>', line) | |
if nrmatch: | |
name_list.append('{} {}'.format(nrmatch.group(2), nrmatch.group(1))) | |
name_list.append('{} {}'.format(nrmatch.group(3), nrmatch.group(1))) | |
name_list = sorted(name_list) | |
name_list.insert(0, ''.join(re.findall(r'<h3 align="center">Popularity in (\w+)</h3>', ''.join(data)))) | |
print name_list[:10] | |
def test(): | |
# This version uses about 4k function calls.
# Fewer function calls means fewer resources wasted.
name_list = [] | |
with open(f) as fo: | |
data = fo.read() | |
match = re.findall(r'<td>(\w+)</td><td>(\w+)</td><td>(\w+)</td>', data) | |
for item in match: | |
name_list.append('{} {}'.format(item[1], item[0]))
name_list.append('{} {}'.format(item[2], item[0]))
name_list.sort() | |
name_list.insert(0, ''.join(re.findall(r'<h3 align="center">Popularity in (\w+)</h3>', data))) | |
""" | |
def extract_names(filename):
    """Extract the year and the ranked baby names from a baby*.html file.

    The file is read in one go and scanned with two regular expressions:
    one for the "Popularity in <year>" heading and one for the table rows
    of the form ``<td>rank</td><td>male</td><td>female</td>``.

    Args:
        filename: path of the html file to parse.

    Returns:
        A list whose first element is the year (as a string), followed by
        one 'name rank' string per distinct name, sorted alphabetically.

    Raises:
        ValueError: if no "Popularity in <year>" heading is found.
        IOError/OSError: if the file cannot be opened.
    """
    with open(filename) as fobj:
        data = fobj.read()

    year_match = re.search(r'Popularity in (\w+)', data)
    if year_match is None:
        raise ValueError(
            'no "Popularity in <year>" heading found in {}'.format(filename))

    rows = re.findall(r'<td>(\w+)</td><td>(\w+)</td><td>(\w+)</td>', data)

    # Rows arrive in ascending rank order, so the first rank seen for a name
    # is its best one. setdefault() keeps that first rank and ignores later
    # occurrences, which de-duplicates names that appear in both columns.
    rank_by_name = {}
    for rank, male, female in rows:
        rank_by_name.setdefault(male, rank)
        rank_by_name.setdefault(female, rank)

    names = sorted(
        '{} {}'.format(name, rank) for name, rank in rank_by_name.items())
    return [year_match.group(1)] + names
def main():
    """Command-line entry point: babynames.py [--summaryfile] file [file ...]

    For each file named on the command line, extract the year and the
    ranked names with extract_names(). With --summaryfile the text is
    written to '<file>.summary'; otherwise it is written to stdout.

    Exits with status 1 (after printing a usage line) when no input file
    is given.
    """
    # Make a list of command line arguments, omitting the [0] element
    # which is the script itself.
    args = sys.argv[1:]

    # Notice the summary flag and remove it from args if it is present.
    summary = bool(args) and args[0] == '--summaryfile'
    if summary:
        args = args[1:]

    # Checked after stripping the flag so that a lone '--summaryfile'
    # also gets the usage message instead of silently doing nothing.
    if not args:
        # Parenthesised single-argument print works on Python 2 and 3.
        print('usage: [--summaryfile] file [file ...]')
        sys.exit(1)

    # For each filename, get the names, then either print the text output
    # or write it to a summary file.
    for filename in args:
        text = '\n'.join(extract_names(filename)) + '\n'
        if summary:
            with open(filename + '.summary', 'w') as fobj:
                fobj.write(text)
        else:
            sys.stdout.write(text)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment