shrubb · May 28, 2017 16:08
diff --git a/csv_to_txt.py b/csv_to_txt.py
 # encoding: utf-8
 import sys
 import csv
 import re

 def preprocess_text(text):
     """Normalize raw song lyrics into a uniform plain-text form.

     Steps, in order: drop tabs, replace 'ё' with 'е', unify ellipsis,
     dashes and quotes, delete whole lines mentioning a chorus/repeat
     marker, cut everything from the first '/' to the end of a line,
     tidy spaces and blank lines, put a space after every comma, pad
     free-standing hyphens with spaces, and upper-case the first letter
     of every line.

     Args:
         text: lyrics of one song as a single string with '\\n' line breaks.

     Returns:
         The cleaned-up lyrics as a new string.
     """
     text = text.replace('\t', '')
     text = text.replace('ё', 'е')
     text = text.replace('…', '...')
     for c in '–—–':
         text = text.replace(c, '-')
     for c in '‘“”`«»':
         text = text.replace(c, '"')

     # '.' does not match '\n', so each pattern removes exactly one line
     # (the line break itself stays and is collapsed further below).
     text = re.sub('.*[Пп]рипев.*', '', text)
     text = re.sub('.*[Пп]овтор.*', '', text)

     text = re.sub('/.*', '', text)          # junk such as " // 2 раза"
     text = re.sub('\n +', '\n', text)       # strip spaces at line start
     text = re.sub(' +\n', '\n', text)       # strip spaces at line end
     text = re.sub(' +', ' ', text)          # collapse runs of spaces
     text = re.sub('\n\n+', '\n\n', text)    # at most one blank line in a row

     # Always a space after a comma. The lookahead only *checks* the next
     # character; consuming it would delete it from the lyrics.
     text = re.sub(r',(?=[^ \n])', ', ', text)

     # A hyphen used as a dash must be surrounded by spaces. Lookarounds
     # keep the neighbouring characters (and line breaks) intact.
     text = re.sub(r'(?<=[ \n])-(?=[^ \n])', '- ', text)
     text = re.sub(r'(?<=[^ \n])-(?=[ \n])', ' -', text)

     # Capitalize the first letter of every line, including the last one.
     lines = text.split('\n')
     for i, line in enumerate(lines):
         if line and line[0].isalpha() and line[0].islower():
             lines[i] = line[0].upper() + line[1:]

     return '\n'.join(lines)

 # Command-line entry: convert a CSV of songs into one shuffled text file.
 if len(sys.argv) != 3:
     print('Usage: python3 csv_to_txt.py chanson.csv chanson.txt')
     # Non-zero status lets shells and callers detect the misuse.
     sys.exit(1)

 csv_file_path = sys.argv[1]
 txt_file_path = sys.argv[2]

 import random
 songs = []

 # Read all rows first so the output file is not truncated when the input
 # is missing or unreadable. The encoding is explicit because the data is
 # Cyrillic and the platform default is not guaranteed to be UTF-8.
 with open(csv_file_path, 'r', encoding='utf-8') as csv_file:
     csv_reader = csv.reader(csv_file, quoting=csv.QUOTE_ALL)
     next(csv_reader, None)  # skip header; tolerate an empty file
     # csv.reader yields [] for blank lines; those carry no song.
     csv_lines = [row for row in csv_reader if row]

 random.shuffle(csv_lines)

 with open(txt_file_path, 'w', encoding='utf-8') as txt_file:
     for line in csv_lines:
         # Each row is: <ignored first column>, author, title, text, labels.
         _, author, title, text, labels = line

         parts = [
             '======\n',
             'Автор: {}\n'.format(author),
             'Название: {}\n'.format(title),
         ]
         if labels:
             # Labels are comma-separated; '/' inside a label becomes ', '.
             for label in labels.split(','):
                 parts.append('* {}\n'.format(label.replace('/', ', ')))
         parts.append('======\n\n')
         parts.append(preprocess_text(text))
         parts.append('\n\n')
         song = ''.join(parts)

         txt_file.write(song)
         songs.append(song)

 # import numpy as np
 # np.save('chanson.npy', np.array(songs))
	# encoding: utf-8
	import sys
	import csv
	import re

	def preprocess_text(text):
		"""Normalize raw song lyrics into a uniform plain-text form.

		Steps, in order: drop tabs, replace 'ё' with 'е', unify ellipsis,
		dashes and quotes, delete whole lines mentioning a chorus/repeat
		marker, cut everything from the first '/' to the end of a line,
		tidy spaces and blank lines, put a space after every comma, pad
		free-standing hyphens with spaces, and upper-case the first letter
		of every line.

		Args:
			text: lyrics of one song as a single string with '\\n' line breaks.

		Returns:
			The cleaned-up lyrics as a new string.
		"""
		text = text.replace('\t', '')
		text = text.replace('ё', 'е')
		text = text.replace('…', '...')
		for c in '–—–':
			text = text.replace(c, '-')
		for c in '‘“”`«»':
			text = text.replace(c, '"')

		# Whole-line removal: '.*' on both sides (without the quantifiers
		# only the marker word plus one neighbour on each side would go).
		text = re.sub('.*[Пп]рипев.*', '', text)
		text = re.sub('.*[Пп]овтор.*', '', text)

		text = re.sub('/.*', '', text)          # junk such as " // 2 раза"
		text = re.sub('\n +', '\n', text)       # strip spaces at line start
		text = re.sub(' +\n', '\n', text)       # strip spaces at line end
		text = re.sub(' +', ' ', text)          # collapse runs of spaces
		text = re.sub('\n\n+', '\n\n', text)    # at most one blank line in a row

		# Always a space after a comma. The lookahead only *checks* the next
		# character; consuming it would delete it from the lyrics.
		text = re.sub(r',(?=[^ \n])', ', ', text)

		# A hyphen used as a dash must be surrounded by spaces. Lookarounds
		# keep the neighbouring characters (and line breaks) intact.
		text = re.sub(r'(?<=[ \n])-(?=[^ \n])', '- ', text)
		text = re.sub(r'(?<=[^ \n])-(?=[ \n])', ' -', text)

		# Capitalize the first letter of every line, including the last one.
		lines = text.split('\n')
		for i, line in enumerate(lines):
			if line and line[0].isalpha() and line[0].islower():
				lines[i] = line[0].upper() + line[1:]

		return '\n'.join(lines)

	# Command-line entry: convert a CSV of songs into one shuffled text file.
	if len(sys.argv) != 3:
		print('Usage: python3 csv_to_txt.py chanson.csv chanson.txt')
		# Non-zero status lets shells and callers detect the misuse.
		sys.exit(1)

	csv_file_path = sys.argv[1]
	txt_file_path = sys.argv[2]

	import random
	songs = []

	# Read all rows first so the output file is not truncated when the input
	# is missing or unreadable. The encoding is explicit because the data is
	# Cyrillic and the platform default is not guaranteed to be UTF-8.
	with open(csv_file_path, 'r', encoding='utf-8') as csv_file:
		csv_reader = csv.reader(csv_file, quoting=csv.QUOTE_ALL)
		next(csv_reader, None)  # skip header; tolerate an empty file
		# csv.reader yields [] for blank lines; those carry no song.
		csv_lines = [row for row in csv_reader if row]

	random.shuffle(csv_lines)

	with open(txt_file_path, 'w', encoding='utf-8') as txt_file:
		for line in csv_lines:
			# Each row is: <ignored first column>, author, title, text, labels.
			_, author, title, text, labels = line

			parts = [
				'======\n',
				'Автор: {}\n'.format(author),
				'Название: {}\n'.format(title),
			]
			if labels:
				# Labels are comma-separated; '/' inside a label becomes ', '.
				for label in labels.split(','):
					parts.append('* {}\n'.format(label.replace('/', ', ')))
			parts.append('======\n\n')
			parts.append(preprocess_text(text))
			parts.append('\n\n')
			song = ''.join(parts)

			txt_file.write(song)
			songs.append(song)

	# import numpy as np
	# np.save('chanson.npy', np.array(songs))