Last active
January 31, 2016 17:57
-
-
Save gibiansky/99acba95707f856fa253 to your computer and use it in GitHub Desktop.
Fancy parsing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Tutorial script: pull the phone-number line out of a block of contact HTML
# by counting how many lines ending in "<br />" follow the section header.

# Triple-quoted strings can span multiple lines, which makes them convenient
# for embedding a chunk of sample HTML as test data.
# The blank line before the "Phone:" line is deliberate: a line that does not
# end in "<br />" must leave the counter untouched.
my_test_data = """
<strong>LHD Contact Information:</strong>
</div>
<div class="right">
<strong>Alameda County Public Health Department</strong> <strong>(NACCHO Member)</strong><br />
1000 Broadway Ste 500<br />
Oakland, CA 94607-4033<br />

Phone: (510)267-8000<br />
Fax: (510)267-3212<br />
<a href="mailto:[email protected]">E-mail</a>
</div>
"""

# Split the data into lines, trimming surrounding whitespace from each one.
lines = [y.strip() for y in my_test_data.split("\n")]

# Option 1: construct a list using a list comprehension.
# (Illustration only: the lines are already stripped, so this is just a copy.)
new_list = [y.strip() for y in lines]

# Option 2: construct the same list with a for loop, starting from an empty
# list and calling .append() for each item.
new_list = []
for item in lines:
    new_list.append(item.strip())

# Now use a 'for' loop to walk over the lines and collect only the ones we
# care about, instead of transforming every line.

# Stays at -1 until a line containing "LHD Contact Information" is seen; from
# then on it counts the lines ending in "<br />" since that header.
number_of_br_tags_seen = -1

# Collects the phone-number lines (kept verbatim, including the "<br />").
phones = []

for line in lines:
    print("Processing " + line)

    # 'in' matches the text anywhere in the line. Each header restarts the
    # count, so several contact sections in one input are handled separately.
    if "LHD Contact Information" in line:
        print("number_of_br_tags_seen = 0")
        number_of_br_tags_seen = 0

    # A negative counter means no header has been seen yet, so this line
    # cannot belong to a contact section.
    if number_of_br_tags_seen < 0:
        print("skipping line")
        continue  # skip the rest of the loop body for this line

    if line.endswith("<br />"):
        number_of_br_tags_seen += 1
        print("number_of_br_tags_seen now equal to " + str(number_of_br_tags_seen))

        # The phone number is the fourth "<br />" line after the header.
        # Checking only when the counter has just changed guarantees that a
        # following line without "<br />" is not collected a second time.
        if number_of_br_tags_seen == 4:
            print("found phone " + line)
            phones.append(line)
# ==== Output =====
# Processing
# skipping line
# Processing <strong>LHD Contact Information:</strong>
# number_of_br_tags_seen = 0
# Processing </div>
# Processing <div class="right">
# Processing <strong>Alameda County Public Health Department</strong> <strong>(NACCHO Member)</strong><br />
# number_of_br_tags_seen now equal to 1
# Processing 1000 Broadway Ste 500<br />
# number_of_br_tags_seen now equal to 2
# Processing Oakland, CA 94607-4033<br />
# number_of_br_tags_seen now equal to 3
# Processing
# Processing Phone: (510)267-8000<br />
# number_of_br_tags_seen now equal to 4
# found phone Phone: (510)267-8000<br />
# Processing Fax: (510)267-3212<br />
# number_of_br_tags_seen now equal to 5
# Processing <a href="mailto:[email protected]">E-mail</a>
# Processing </div>
# Processing
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment