Last active
April 25, 2018 05:56
-
-
Save mengdiwang/371dd53d71b63cb9d1916bdcf0df2fe5 to your computer and use it in GitHub Desktop.
regex.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding=utf8
# The tag above declares the source encoding and is kept for Python 2.x compatibility.
"""Demo script: pull title, href and inner text out of anchor tags with a regex.

Runs the pattern over a fixed HTML snippet and prints every match together
with the span and text of each capture group.
"""
import re

# Capture groups, in order: (1) title attribute, (2) href attribute,
# (3) inner HTML of the anchor.  The pattern only matches anchors whose
# attributes appear exactly as `target="_blank" title=... href=...`; the lazy
# `.*?` stops each capture at the first closing quote / closing tag.
regex = r'<a target="_blank" title="(.*?)" href="(.*?)">(.*?)</a>'

test_str = "<p> 二〇一八年四月二十日<br><br><a target=\"_blank\" title=\"上海证券交易所股票上市规则(2018年4月修订)\" href=\"/aboutus/mediacenter/hotandd/a/20180420/4a6cd527bc7bcac21dd66a2abb60d3d5.doc\">上海证券交易所股票上市规则(2018年4月修订)<br></a><a target=\"_blank\" title=\"《上海证券交易所股票上市规则(2018年4月修订)》修订说明\" href=\"/aboutus/mediacenter/hotandd/a/20180420/f086a86397ee054f984443a8ac9d1f3a.doc\">《上海证券交易所股票上市规则(2018年4月修订)》修订说明</a><br><br type=\"_moz\"> </p>"

matches = re.finditer(regex, test_str, re.MULTILINE | re.UNICODE)

# Matches are reported 1-based, hence start=1.
for matchNum, match in enumerate(matches, start=1):
    print ("Match {matchNum} was found at {start}-{end}: {match}".format(matchNum = matchNum, start = match.start(), end = match.end(), match = match.group()))

    # Group 0 is the whole match (already printed above), so iterate the
    # capture groups 1..N only.
    for groupNum in range(1, len(match.groups()) + 1):
        print ("Group {groupNum} found at {start}-{end}: {group}".format(groupNum = groupNum, start = match.start(groupNum), end = match.end(groupNum), group = match.group(groupNum)))

# Note: for Python 2.7 compatibility, use ur'' to prefix the regex and u"" to prefix the test string and substitution.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment