Created
January 8, 2011 09:38
-
-
Save junaidpv/770718 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
from urllib.request import * | |
from html.parser import * | |
import re | |
class MyParser(HTMLParser):
    """HTML parser that records the ``href`` value of every ``<a>`` start tag.

    The collected values are appended, in the order the tags are parsed,
    to the instance attribute ``hyperlinks``.
    """

    def __init__(self, *args, **kwargs):
        """Initialise the underlying HTMLParser and an empty link list."""
        super().__init__(*args, **kwargs)
        # Per-instance list: a class-level list would be shared by every
        # parser, so links from unrelated documents would pile up together.
        self.hyperlinks = []

    def handle_starttag(self, tag, attributes):
        """Forward anchor start tags to ``start_a``; ignore all other tags."""
        if tag == 'a':
            self.start_a(attributes)

    def start_a(self, attributes):
        """Append the value of each ``href`` attribute to ``self.hyperlinks``.

        ``attributes`` is an iterable of ``(name, value)`` pairs.
        """
        for name, value in attributes:
            if name == 'href':
                self.hyperlinks.append(value)
#web_page = urlopen('http://junaidpv.in') | |
#print(web_page.info().get('Content-Type')) | |
#my_parser = MyParser() | |
#my_parser.feed(str(web_page.read(), encoding='utf-8')) | |
#web_page.close() | |
#for link in my_parser.hyperlinks: | |
# print(link) | |
def f5(seq, idfun=None):
    """Return the items of ``seq`` without duplicates, in first-seen order.

    Two items are duplicates when ``idfun`` maps them to the same marker;
    only the first item for each marker is kept. When ``idfun`` is None,
    each item serves as its own marker. Markers must be hashable.
    """
    key_of = idfun if idfun is not None else (lambda item: item)
    seen_markers = set()
    unique_items = []
    for item in seq:
        marker = key_of(item)
        if marker not in seen_markers:
            seen_markers.add(marker)
            unique_items.append(item)
    return unique_items
# Read the whole input up front; the context manager closes the file even
# if reading or decoding raises.
with open('input.txt', mode='r', encoding='utf-8') as i_file:
    text = i_file.read()

# A "word" is a maximal run of characters from U+0D00-U+0D7F (the Unicode
# Malayalam block) plus U+200C / U+200D (zero-width non-joiner / joiner).
pattern = re.compile('[\u0D00-\u0D7F\u200C\u200D]+')
words = pattern.findall(text)
unique_words = f5(words)  # de-duplicated, first-occurrence order kept

print(len(words), " words.")
print(len(unique_words), " unique words.")

# Write one unique word per line. A single writelines() call over all lines
# replaces the per-word writelines(str) + flush(); the file is flushed and
# closed when the with-block exits.
with open('output.txt', mode='w+', encoding='utf-8') as o_file:
    o_file.writelines(word + "\n" for word in unique_words)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment