Created
May 28, 2017 16:08
-
-
Save shrubb/f2a973c5ea8e8101308a76a699464a5a to your computer and use it in GitHub Desktop.
Butyrka csv-to-txt preprocessing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: utf-8 | |
import sys | |
import csv | |
import re | |
def preprocess_text(text):
    """Normalize raw song lyrics into clean plain text.

    Steps, in order: strip tabs, unify some typographic characters
    (``ё`` -> ``е``, ellipsis -> three dots, dashes -> ``-``, fancy quotes
    -> ``"``), blank out lines mentioning a chorus or a repeat, cut
    everything from a ``/`` to the end of its line, tidy whitespace,
    normalize spacing around commas and stand-alone hyphens, and
    upper-case the first letter of every line.

    Args:
        text: The lyrics as a single string with ``\\n`` line breaks.

    Returns:
        The cleaned-up lyrics string.
    """
    text = text.replace('\t', '')
    text = text.replace('ё', 'е')
    text = text.replace('…', '...')
    for c in '–—–':
        text = text.replace(c, '-')
    for c in '‘“”`«»':
        text = text.replace(c, '"')

    # Blank out whole lines that mention a chorus ("припев") or a repeat
    # ("повтор"); '.' does not match '\n', so only that line is affected.
    text = re.sub('.*[Пп]рипев.*', '', text)
    text = re.sub('.*[Пп]овтор.*', '', text)
    # Junk such as " // 2 раза": drop from the first '/' to the end of the line.
    text = re.sub('/.*', '', text)

    text = re.sub('\n +', '\n', text)      # no spaces at the start of a line
    text = re.sub(' +\n', '\n', text)      # no spaces at the end of a line
    text = re.sub(' +', ' ', text)         # collapse runs of spaces
    text = re.sub('\n\n+', '\n\n', text)   # at most one empty line in a row

    # Always a space after a comma. The lookahead keeps the following
    # character instead of consuming (and thereby deleting) it.
    text = re.sub(r',(?=[^ \n])', ', ', text)

    # A hyphen used as a dash is always surrounded by whitespace. Only the
    # missing space is inserted; the neighbouring characters (including a
    # line break) are preserved. Intra-word hyphens are left alone.
    text = re.sub(r'([ \n])-(?=[^ \n])', r'\1- ', text)
    text = re.sub(r'(?<=[^ \n])-([ \n])', r' -\1', text)

    # Upper-case the first letter of every line (single pass, every line
    # considered, including a one-character last line).
    lines = text.split('\n')
    for k, line in enumerate(lines):
        if line and line[0].isalpha() and line[0].islower():
            lines[k] = line[0].upper() + line[1:]
    return '\n'.join(lines)
# Script body: read songs from a CSV file, shuffle them, and write each one
# to a text file as a small header (author, title, labels) followed by the
# preprocessed lyrics.
if len(sys.argv) != 3:
    print('Usage: python3 csv_to_txt.py chanson.csv chanson.txt')
    sys.exit(1)  # wrong invocation is an error, so report a non-zero status

csv_file_path = sys.argv[1]
txt_file_path = sys.argv[2]

import random

songs = []

# Explicit UTF-8 so the Cyrillic data does not depend on the system locale;
# newline='' lets the csv module handle line breaks inside quoted fields.
with open(csv_file_path, 'r', encoding='utf-8', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, quoting=csv.QUOTE_ALL)
    next(csv_reader, None)  # skip header (tolerates an empty file)
    csv_lines = list(csv_reader)

random.shuffle(csv_lines)

with open(txt_file_path, 'w', encoding='utf-8') as txt_file:
    for line in csv_lines:
        # Each row must have exactly five columns; the first one is unused.
        _, author, title, text, labels = line

        song = '======\n'
        song += 'Автор: {}\n'.format(author)
        song += 'Название: {}\n'.format(title)
        if labels:
            # One bullet per comma-separated label; '/' inside a label
            # is written out as ', '.
            for label in labels.split(','):
                song += '* {}\n'.format(label.replace('/', ', '))
        song += '======\n\n'
        song += preprocess_text(text)
        song += '\n\n'

        txt_file.write(song)
        songs.append(song)

# Optional dump of the collected songs, left disabled as in the original:
# import numpy as np
# np.save('chanson.npy', np.array(songs))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment