Last active
March 4, 2019 14:41
-
-
Save wowkin2/482c9aac0db802f58cf8a00e5aabf9c4 to your computer and use it in GitHub Desktop.
Python parser that extracts all non-commented links from an HTML document.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
# Sample HTML used by the demo in ``main``. It mixes "active" links (inside an
# attribute, bare relative paths, a protocol-relative URL and absolute
# http/https URLs) with links that sit inside HTML comments and therefore
# must NOT be reported by the parser.
content = '''
<img src="/images/lol/hallo.png" />
/images/lol/hallo.png
/images/lol/hallo.png
//example.com/images/lol/hallo.png
http://example.com/images/lol/hallo.png
https://example.com/images/lol/hallo.png
<!-- /images/lol/commented.png -->
<!-- <img src="/images/lol/commented2.png" /> -->
'''
# Two alternatives, tried in this order at every position:
#   1. a complete HTML comment, consumed as a whole so that nothing inside
#      it can be reported (``[\s\S]`` lets the comment span several lines);
#   2. a link, captured in the named group ``url``: an optional ``http:`` /
#      ``https:`` scheme, one or two slashes, then a run of characters that
#      cannot delimit a link in HTML (no whitespace, quotes or angle
#      brackets), ending in a ``.ext`` suffix.
# The run is greedy on purpose: it backtracks to the LAST ``.ext`` so that
# ``http://example.com/a/b.png`` is one link instead of being cut after
# ``.com``.
_ACTIVE_URL_RE = re.compile(
    r'<!--[\s\S]*?-->'
    r'|(?P<url>(?:https?:)?//?[^\s"\'<>]+\.\w+)'
)


def parse_active_urls(html_text):
    """Return every link in *html_text* that is not inside an HTML comment.

    A link is recognised when it starts with ``/``, ``//``, ``http://`` or
    ``https://`` and ends with a dot followed by word characters (for
    example ``.png``). Links are returned in document order and duplicates
    are kept.

    Args:
        html_text: HTML markup (``str``) to scan. Empty or ``None`` input
            yields an empty list.

    Returns:
        A list of the matched link strings.
    """
    if not html_text:
        return []
    # Comment matches leave the ``url`` group unset (None); skip those.
    return [
        match.group('url')
        for match in _ACTIVE_URL_RE.finditer(html_text)
        if match.group('url')
    ]
def main():
    """Run the parser on the sample ``content`` and print one link per line."""
    for url in parse_active_urls(content):
        print(url)
# Run the demo only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment