Created
September 10, 2010 23:15
-
-
Save sandello/574542 to your computer and use it in GitHub Desktop.
Snippets for IR-course in Yandex Data Analysis School
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# For Yandex Data Analysis School | |
"""Takes MediaWiki XML dump and extracts pages to separate files.""" | |
# Number of buckets used when naming output subdirectories: each page goes
# into the subdirectory named after its page counter modulo this value.
SUBDIRECTORY_SPREAD = 512
import sys | |
import os | |
import os.path | |
from optparse import OptionParser | |
from xml.sax import parse | |
from xml.sax.handler import ContentHandler | |
def smart_open(filename, mode = "rb", encoding = "utf-8"):
    """Open *filename* as a byte stream, optionally wrapped in a text codec.

    The file is always opened in binary mode ("b" is appended to *mode* when
    missing).  The opener is picked from the file name:

    * "-"       -- standard output for write/append modes, standard input
                   otherwise;
    * "*.gz"    -- gzip.open;
    * "*.bz2"   -- bz2.BZ2File;
    * otherwise -- the built-in open.

    When *encoding* is truthy the handle is wrapped in a codecs stream
    writer (write/append modes) or stream reader (any other mode) that
    silently drops characters it cannot encode/decode ("ignore" error
    policy).  When *encoding* is false the raw byte handle is returned.
    """
    if "b" not in mode:
        mode = mode + "b"

    # Append mode produces output as well, so it needs a writer, not a reader.
    writing = "w" in mode or "a" in mode

    if filename == "-":
        handle = sys.stdout if writing else sys.stdin
        # Text-mode standard streams (Python 3) expose the underlying byte
        # stream as ".buffer"; streams without that attribute are used as is.
        handle = getattr(handle, "buffer", handle)
    elif filename.endswith(".gz"):
        import gzip
        handle = gzip.open(filename, mode)
    elif filename.endswith(".bz2"):
        import bz2
        handle = bz2.BZ2File(filename, mode)
    else:
        handle = open(filename, mode)

    if encoding:
        import codecs
        if writing:
            handle = codecs.getwriter(encoding)(handle, "ignore")
        else:
            handle = codecs.getreader(encoding)(handle, "ignore")

    return handle
class MyHandler(ContentHandler):
    """SAX handler that reports every non-redirect <page> to a callback.

    For each </page> end tag of a page that contained no <redirect> element,
    ``callback(counter, title, text)`` is called with the running page
    counter and the whitespace-stripped contents of the page's <title> and
    <text> elements.  The counter starts at 1 and advances for every page,
    redirects included.
    """

    def __init__(self, callback):
        # ContentHandler is an old-style class on Python 2, so call the base
        # initializer explicitly instead of via super().
        ContentHandler.__init__(self)
        self.callback = callback
        self.counter = 0
        self.clear()

    def clear(self):
        """Reset the per-page state and advance the page counter."""
        self.in_page = False
        self.in_title = False
        self.in_text = False
        self.is_redirect = False
        self.current_title = u""
        self.current_text = u""
        # characters() may be called many times per element; chunks are
        # collected here and joined once at the end tag to avoid repeated
        # string concatenation.
        self._title_chunks = []
        self._text_chunks = []
        self.counter += 1

    def startElement(self, name, attrs):
        if name == "page":
            self.in_page = True
        elif self.in_page:
            if name == "redirect":
                self.is_redirect = True
            elif name == "title":
                assert not self.in_title and not self.in_text
                self.in_title = True
            elif name == "text":
                assert not self.in_title and not self.in_text
                self.in_text = True

    def endElement(self, name):
        if name == "page":
            assert not self.in_title
            assert not self.in_text
            if not self.is_redirect:
                self.callback(self.counter, self.current_title.strip(), self.current_text.strip())
            self.clear()
        elif self.in_page:
            if name == "title":
                assert self.in_title and not self.in_text
                self.in_title = False
                # Appending keeps the contents of repeated elements within
                # one page concatenated.
                self.current_title += u"".join(self._title_chunks)
                del self._title_chunks[:]
            elif name == "text":
                assert not self.in_title and self.in_text
                self.in_text = False
                self.current_text += u"".join(self._text_chunks)
                del self._text_chunks[:]

    def characters(self, data):
        if self.in_title:
            self._title_chunks.append(data)
        if self.in_text:
            self._text_chunks.append(data)
def extract_pages(stream, output_directory):
    """Parse a MediaWiki XML dump and write each reported page to its own file.

    *stream* is handed to xml.sax.parse together with a MyHandler instance.
    Every page the handler reports is written to
    ``<output_directory>/<NNN>/<NNNNNNNN>.txt`` where ``NNNNNNNN`` is the
    page counter and ``NNN`` is the counter modulo SUBDIRECTORY_SPREAD.
    The file contains the title, an empty line and the page text.  The path
    of every written file is printed to standard output.

    Returns whatever xml.sax.parse returns.
    """
    def extract_page(aydee, title, text):
        subdirectory = "{0:03d}".format(aydee % SUBDIRECTORY_SPREAD)
        filename = "{0:08d}.txt".format(aydee)

        output_path = os.path.join(output_directory, subdirectory)
        if not os.path.isdir(output_path):
            os.mkdir(output_path)
        output_path = os.path.join(output_path, filename)

        # Same output as a print statement, but valid on Python 2 and 3.
        sys.stdout.write(output_path + "\n")

        f = smart_open(output_path, "w")
        try:
            f.write(title)
            f.write(u"\n\n")
            f.write(text)
        finally:
            # Close the file even when a write fails so that handles do not
            # pile up while a large dump is being processed.
            f.close()

    return parse(stream, MyHandler(extract_page))
if __name__ == "__main__":
    # Command-line entry point: extract pages from the dump given with -i
    # into the existing directory given with -o.
    parser = OptionParser(description = __doc__)
    parser.add_option("-i", "--input", metavar = "FILE", help = "read dump from FILE")
    parser.add_option("-o", "--output", metavar = "DIRECTORY", help = "write files to DIRECTORY")
    (options, args) = parser.parse_args()

    if not options.input or not options.output:
        # parser.error() prints the usage and the message and then exits the
        # process itself, so no further handling is needed here.
        parser.error("-i, -o options are required.")

    output_directory = os.path.abspath(options.output)
    if not os.path.isdir(output_directory):
        sys.stderr.write("-o option should point to directory.\n")
        sys.exit(1)

    # The XML parser needs raw bytes, hence no text decoding.
    input_file = smart_open(options.input, encoding = None)
    try:
        extract_pages(input_file, output_directory)
    finally:
        input_file.close()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# For Yandex Data Analysis School | |
"""Takes MediaWiki XML dump as input and trims it to first N pages.""" | |
import sys | |
from optparse import OptionParser | |
def smart_open(filename, mode = "rb", encoding = "utf-8"):
    """Open *filename* as a byte stream, optionally wrapped in a text codec.

    The file is always opened in binary mode ("b" is appended to *mode* when
    missing).  The opener is picked from the file name:

    * "-"       -- standard output for write/append modes, standard input
                   otherwise;
    * "*.gz"    -- gzip.open;
    * "*.bz2"   -- bz2.BZ2File;
    * otherwise -- the built-in open.

    When *encoding* is truthy the handle is wrapped in a codecs stream
    writer (write/append modes) or stream reader (any other mode) that
    silently drops characters it cannot encode/decode ("ignore" error
    policy).  When *encoding* is false the raw byte handle is returned.
    """
    if "b" not in mode:
        mode = mode + "b"

    # Append mode produces output as well, so it needs a writer, not a reader.
    writing = "w" in mode or "a" in mode

    if filename == "-":
        handle = sys.stdout if writing else sys.stdin
        # Text-mode standard streams (Python 3) expose the underlying byte
        # stream as ".buffer"; streams without that attribute are used as is.
        handle = getattr(handle, "buffer", handle)
    elif filename.endswith(".gz"):
        import gzip
        handle = gzip.open(filename, mode)
    elif filename.endswith(".bz2"):
        import bz2
        handle = bz2.BZ2File(filename, mode)
    else:
        handle = open(filename, mode)

    if encoding:
        import codecs
        if writing:
            handle = codecs.getwriter(encoding)(handle, "ignore")
        else:
            handle = codecs.getreader(encoding)(handle, "ignore")

    return handle
def trim_dump(stream, required_number_of_pages):
    """Yield the lines of *stream* up to the line closing the N-th page.

    Lines are passed through unchanged.  Iteration stops right after a line
    has been yielded once the number of lines seen so far that contain
    "</page>" equals *required_number_of_pages*.  If a "<mediawiki" opening
    tag was passed through without a later "</mediawiki>", a closing
    "</mediawiki>" line is yielded last so the output stays balanced.
    """
    pages_seen = 0
    root_is_open = False

    for line in stream:
        if u"<mediawiki" in line:
            root_is_open = True
        if u"</mediawiki>" in line:
            root_is_open = False
        if u"</page>" in line:
            pages_seen += 1

        yield line

        if pages_seen == required_number_of_pages:
            break

    if root_is_open:
        yield u"</mediawiki>\n"
if __name__ == "__main__":
    # Command-line entry point: copy the first N pages of the dump given
    # with -i into the file given with -o.
    parser = OptionParser(description = __doc__)
    parser.add_option("-i", "--input", metavar = "FILE", help = "read dump from FILE")
    parser.add_option("-o", "--output", metavar = "FILE", help = "write trimmed dump to FILE")
    parser.add_option("-n", "--number", metavar = "N", help = "trim dump to first N pages", type = "int")
    (options, args) = parser.parse_args()

    if not options.input or not options.output or not options.number:
        # parser.error() prints the usage and the message and then exits the
        # process itself, so no further handling is needed here.
        parser.error("-i, -o, -n options are required.")

    input_file = smart_open(options.input)
    try:
        output_file = smart_open(options.output, "w")
        try:
            output_file.writelines(trim_dump(input_file, options.number))
        finally:
            output_file.close()
    finally:
        # Both handles are released even if reading or writing fails.
        input_file.close()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// For Yandex Data Analysis School | |
// XXX: Включите этот define в зависимости от вашей операционной системы | |
#define WINDOWS | |
#include <string> | |
#include <fstream> | |
#include <iostream> | |
// ICU: http://icu-project.org | |
#include <unicode/utypes.h> | |
#include <unicode/unistr.h> | |
#include <unicode/uniset.h> | |
#ifdef WINDOWS | |
#include <windows.h> | |
#endif | |
// ==== Выделение слов из потока | |
// | |
// Функции PrintWords* читают входной поток is и построчно выписывают слова в os, переводя их в нижний регистр.
// Входная и выходная кодировки потоков -- UTF-8. | |
// Ниже приведены три возможные реализации функции: | |
// - выделение слов как последовательности алфавитных символов с помощью UnicodeSet, | |
// - выделение слов как последовательности алфавитных символов с помощью IsAlphabetic, | |
// - разделение исходной строки по пробельным символам. | |
// | |
// ==== Вспомогательные функции | |
// | |
// Для справки, список потенциально полезных функций определения типа символа: | |
// - u_isUAlphabetic | |
// - u_isULowercase | |
// - u_isUUppercase | |
// - u_isUWhiteSpace | |
// | |
// А также функции в стиле ctype:
// - u_islower (u_tolower) | |
// - u_isupper (u_toupper) | |
// - u_istitle (u_totitle) | |
// - u_isdigit | |
// - u_isalpha | |
// - u_isalnum | |
// - u_isxdigit | |
// - u_ispunct | |
// - u_isgraph | |
// - u_isblank | |
// - u_isspace | |
// - u_iscntrl | |
// - u_isprint | |
// | |
// ==== Продвинутые техники | |
// | |
// Более продвинутый метод итерации по словам -- это использование BreakIterator. | |
// Для простоты этот метод не рассматривается. | |
// | |
// Также можно для каких-либо своих целей использовать регулярные выражения. | |
// См. классы RegexPattern и RegexMatcher. | |
// | |
// ==== API References | |
// | |
// http://icu-project.org/apiref/icu4c/index.html | |
// | |
// В частности: | |
// http://icu-project.org/apiref/icu4c/classUnicodeString.html | |
// http://icu-project.org/apiref/icu4c/classUnicodeSet.html | |
// | |
// http://icu-project.org/apiref/icu4c/uchar_8h.html -- классификация символов. | |
// | |
// http://icu-project.org/apiref/icu4c/classBreakIterator.html | |
// http://icu-project.org/apiref/icu4c/classRegexMatcher.html | |
// http://icu-project.org/apiref/icu4c/classRegexPattern.html | |
#ifdef WINDOWS | |
// Хак для печати UTF-8 в Windows-консоли в обход стандартных операторов <<. | |
std::ostream& operator<<(std::ostream& os, const std::string& s) | |
{ | |
static HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE); | |
static HANDLE stderr_handle = GetStdHandle(STD_ERROR_HANDLE); | |
if (os == std::cout) | |
{ | |
DWORD characters_written; | |
WriteConsoleA(stdout_handle, s.data(), s.length(), &characters_written, NULL); | |
} | |
else if (os == std::cerr) | |
{ | |
DWORD characters_written; | |
WriteConsoleA(stderr_handle, s.data(), s.length(), &characters_written, NULL); | |
} | |
else | |
{ | |
os.write(s.data(), s.length()); | |
} | |
return os; | |
} | |
#endif | |
void PrintWordsWithUnicodeSet(std::istream& is, std::ostream& os) | |
{ | |
UnicodeSet allowed_characters; | |
UnicodeString current_line; | |
{ | |
UErrorCode error_code = U_ZERO_ERROR; | |
allowed_characters.applyPattern("[\\p{Letter}]", error_code); | |
if (U_FAILURE(error_code)) | |
{ | |
std::cerr << "Failed to create set of allowed characters." << std::endl; | |
std::exit(1); | |
} | |
} | |
std::string buffer; | |
while (std::getline(is, buffer)) | |
{ | |
current_line = UnicodeString::fromUTF8(StringPiece(buffer.c_str(), buffer.length())); | |
current_line.toLower(); | |
int32_t i, j; | |
for (i = 0; i < current_line.length(); ++i) | |
{ | |
for (j = i; j < current_line.length() && allowed_characters.contains(current_line[j]); ++j) | |
{ | |
} | |
if (j > i) | |
{ | |
UnicodeString word(current_line, i, j - i); | |
buffer.clear(); | |
word.toUTF8String(buffer); | |
os << buffer << std::endl; | |
} | |
i = j; | |
} | |
} | |
} | |
void PrintWordsWithIsAlphabetic(std::istream& is, std::ostream& os) | |
{ | |
UnicodeString current_line; | |
std::string buffer; | |
while (std::getline(is, buffer)) | |
{ | |
current_line = UnicodeString::fromUTF8(StringPiece(buffer.c_str(), buffer.length())); | |
current_line.toLower(); | |
int32_t i, j; | |
for (i = 0; i < current_line.length(); ++i) | |
{ | |
for (j = i; j < current_line.length() && u_isUAlphabetic(current_line[j]); ++j) | |
{ | |
} | |
if (j > i) | |
{ | |
UnicodeString word(current_line, i, j - i); | |
buffer.clear(); | |
word.toUTF8String(buffer); | |
os << buffer << std::endl; | |
} | |
i = j; | |
} | |
} | |
} | |
void PrintWordsBySpaces(std::istream& is, std::ostream& os) | |
{ | |
UnicodeString current_line; | |
std::string buffer; | |
while (std::getline(is, buffer)) | |
{ | |
current_line = UnicodeString::fromUTF8(StringPiece(buffer.c_str(), buffer.length())); | |
current_line.toLower(); | |
int32_t i, j; | |
for (i = 0; i < current_line.length(); ++i) | |
{ | |
for (j = i; j < current_line.length() && !u_isUWhiteSpace(current_line[j]); ++j) | |
{ | |
} | |
if (j > i) | |
{ | |
UnicodeString word(current_line, i, j - i); | |
buffer.clear(); | |
word.toUTF8String(buffer); | |
os << buffer << std::endl; | |
} | |
i = j; | |
} | |
} | |
} | |
int main(int argc, char** argv) | |
{ | |
#ifdef WINDOWS | |
UINT _previous_cp = GetConsoleCP(); | |
UINT _previous_output_cp = GetConsoleOutputCP(); | |
::SetConsoleCP(CP_UTF8); | |
::SetConsoleOutputCP(CP_UTF8); | |
#endif | |
if (argc < 2) | |
{ | |
std::cerr << "Please, specify input file as an argument." << std::endl; | |
return 1; | |
} | |
std::ifstream input_file(argv[1], std::ios::binary | std::ios::in); | |
if (!input_file) | |
{ | |
std::cerr << "Cannot open file." << std::endl; | |
return 1; | |
} | |
//PrintWordsWithUnicodeSet(input_file, std::cout); | |
//PrintWordsWithIsAlphabetic(input_file, std::cout); | |
PrintWordsBySpaces(input_file, std::cout); | |
#ifdef WINDOWS | |
::SetConsoleCP(_previous_cp); | |
::SetConsoleOutputCP(_previous_output_cp); | |
#endif | |
return 0; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# For Yandex Data Analysis School | |
"""Takes stripped MediaWiki XML dump and calculates different statistics.""" | |
import sys | |
import unicodedata | |
from optparse import OptionParser | |
# Single-character predicates based on the Unicode general category.

def IS_PUNCTUATION(u):
    """Return True if the general category of *u* is one of the P* classes."""
    return unicodedata.category(u).startswith("P")


def IS_WHITESPACE(u):
    """Return True if the general category of *u* is one of the Z* classes."""
    return unicodedata.category(u).startswith("Z")


def IS_LETTER(u):
    """Return True if the general category of *u* is one of the L* classes."""
    return unicodedata.category(u).startswith("L")


def IS_NUMBER(u):
    """Return True if the general category of *u* is one of the N* classes."""
    return unicodedata.category(u).startswith("N")


def IS_LOWERCASE(u):
    """Return True if the general category of *u* is exactly Ll."""
    return "Ll" == unicodedata.category(u)


def IS_UPPERCASE(u):
    """Return True if the general category of *u* is exactly Lu."""
    return "Lu" == unicodedata.category(u)


def IS_CYRILLIC(u):
    """Return True if the code point of *u* lies in U+0400..U+04FF."""
    return 0x0400 <= ord(u) <= 0x04FF
def smart_open(filename, mode = "rb", encoding = "utf-8"):
    """Open *filename* as a byte stream, optionally wrapped in a text codec.

    The file is always opened in binary mode ("b" is appended to *mode* when
    missing).  The opener is picked from the file name:

    * "-"       -- standard output for write/append modes, standard input
                   otherwise;
    * "*.gz"    -- gzip.open;
    * "*.bz2"   -- bz2.BZ2File;
    * otherwise -- the built-in open.

    When *encoding* is truthy the handle is wrapped in a codecs stream
    writer (write/append modes) or stream reader (any other mode) that
    silently drops characters it cannot encode/decode ("ignore" error
    policy).  When *encoding* is false the raw byte handle is returned.
    """
    if "b" not in mode:
        mode = mode + "b"

    # Append mode produces output as well, so it needs a writer, not a reader.
    writing = "w" in mode or "a" in mode

    if filename == "-":
        handle = sys.stdout if writing else sys.stdin
        # Text-mode standard streams (Python 3) expose the underlying byte
        # stream as ".buffer"; streams without that attribute are used as is.
        handle = getattr(handle, "buffer", handle)
    elif filename.endswith(".gz"):
        import gzip
        handle = gzip.open(filename, mode)
    elif filename.endswith(".bz2"):
        import bz2
        handle = bz2.BZ2File(filename, mode)
    else:
        handle = open(filename, mode)

    if encoding:
        import codecs
        if writing:
            handle = codecs.getwriter(encoding)(handle, "ignore")
        else:
            handle = codecs.getreader(encoding)(handle, "ignore")

    return handle
def tokenize(stream):
    """Yield lower-cased tokens from an iterable of text lines.

    Each line is split on whitespace.  Tokens starting with "<" are kept
    intact; every other token is reduced to the characters accepted by
    IS_LETTER.  All tokens are lower-cased and tokens that end up empty
    are dropped.
    """
    for line in stream:
        for token in line.split():
            if not token.startswith(u"<"):
                letters = [ch for ch in token if IS_LETTER(ch)]
                token = u"".join(letters)
            token = token.lower()
            if token:
                yield token
def calculate_statistics(word_stream):
    """Consume a token stream, print corpus statistics and return them.

    *word_stream* is an iterable of tokens: the markers <page>, </page>,
    <title>, </title>, <main_text>, </main_text>, a fixed set of ignored
    markup tokens, and ordinary words.  Marker nesting is checked with
    assertions.

    Four counters are written to standard error and returned as the tuple
    ``(documents, words, pointers, distinct_words)``:

    * documents      -- number of <page> markers;
    * words          -- number of ordinary words;
    * pointers       -- sum over pages of the number of distinct words in
                        that page;
    * distinct_words -- number of distinct words overall.
    """
    number_of_documents = 0
    number_of_words = 0
    number_of_pointers = 0

    global_vocabulary = set()
    local_vocabulary = set()

    in_page = False
    in_title = False
    in_text = False

    blocked_words = frozenset(u"<b> <h> <i> <id> <ref </b> </h> </i> </id> </ref>".split())

    for word in word_stream:
        if word in blocked_words:
            continue
        elif word == u"<main_text>":
            assert in_page and not in_title and not in_text
            in_text = True
        elif word == u"</main_text>":
            assert in_page and not in_title and in_text
            in_text = False
        elif word == u"<page>":
            assert not in_page and not in_title and not in_text
            in_page = True
            number_of_documents += 1
            # Flush the vocabulary of the previous page before starting anew.
            number_of_pointers += len(local_vocabulary)
            local_vocabulary.clear()
        elif word == u"</page>":
            assert in_page and not in_title and not in_text
            in_page = False
        elif word == u"<title>":
            assert in_page and not in_title and not in_text
            in_title = True
        elif word == u"</title>":
            assert in_page and in_title and not in_text
            in_title = False
        else:
            assert in_page and (in_title or in_text)
            number_of_words += 1
            global_vocabulary.add(word)
            local_vocabulary.add(word)

    # The vocabulary of the last page is not followed by another <page>
    # marker, so it has to be accounted for here.
    number_of_pointers += len(local_vocabulary)
    number_of_distinct_words = len(global_vocabulary)

    sys.stderr.write("# Documents: {0}\n".format(number_of_documents))
    sys.stderr.write("# Words: {0}\n".format(number_of_words))
    sys.stderr.write("# Pointers: {0}\n".format(number_of_pointers))
    sys.stderr.write("# Distinct Words: {0}\n".format(number_of_distinct_words))

    return (number_of_documents, number_of_words, number_of_pointers, number_of_distinct_words)
if __name__ == "__main__":
    # Command-line entry point: print statistics for the dump given with -i.
    parser = OptionParser(description = __doc__)
    parser.add_option("-i", "--input", metavar = "FILE", help = "read dump from FILE")
    (options, args) = parser.parse_args()

    if not options.input:
        # parser.error() prints the usage and the message and then exits the
        # process itself, so no further handling is needed here.
        parser.error("-i option is required.")

    input_file = smart_open(options.input)
    try:
        calculate_statistics(tokenize(input_file))
    finally:
        input_file.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment