Dmitry Chaplinsky dchaplinsky

pip install markdown imgkit

You'll also need to install wkhtmltopdf on your system:

- On Ubuntu/Debian: `sudo apt-get install wkhtmltopdf`
- On macOS: `brew install wkhtmltopdf`
- On Windows: download the installer from the wkhtmltopdf website.

	#!/usr/bin/env python3
	"""Convert TMX files to ParaConc format.

	This script converts TMX (Translation Memory eXchange) files to ParaConc format,
	which consists of three separate XML files: source language, target language,
	and alignment information. It supports complex alignment patterns, HTML tag preservation,
	and includes input validation.

	Example usage:
	python tmx_to_paraconc.py input.tmx -o output_prefix

	import os
	import torch
	import numpy as np
	from sentence_transformers import SentenceTransformer

	# Load pre-trained model for sentence embeddings
	model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")

	# Set up LSTM model
	input_size = 768 # Size of the sentence embeddings

	import json
	import argparse
	from typing import Dict
	from pathlib import Path

	import smart_open
	import ftfy
	from tqdm import tqdm
	import html2text
	from datasets import load_dataset

	import argparse
	from flair.data import Sentence
	from flair.embeddings import (
	DocumentEmbeddings,
	FlairEmbeddings,
	DocumentLMEmbeddings,
	DocumentPoolEmbeddings,
	)
	from torch import Tensor

	#!/bin/bash

	# You will need GNU parallel and pv installed to run this script (e.g. `sudo apt-get install parallel pv`)

	# download file containing urls
	curl http://webdatacommons.org/structureddata/2022-12/files/file.list > urls.txt

	# create output file
	touch output.txt

	import bz2
	import logging
	import multiprocessing
	import re
	from pickle import PicklingError

	# LXML isn't faster, so let's go with the built-in solution
	from xml.etree.ElementTree import iterparse

	# pip install pymorphy3
	# pip install pymorphy3-dicts-uk

	import pymorphy3
	from collections import defaultdict
	from itertools import product
	from typing import List, List


	morph = pymorphy3.MorphAnalyzer(lang="uk")

	import os.path
	from flair.data import Dictionary
	from flair.models import LanguageModel
	from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus


	def train_flair_embeddings(
	corpus_path="/data/ubertext/for_flair",
	dictionary_path="/home/dima/Projects/flair_embeddings/flair_dictionary.pkl",
	lm_file="./language_model_forward_no_amp_accum_grad_fixed",

	import wn
	import csv

	from collections import Counter, defaultdict
	from tqdm.notebook import tqdm


	wn.download("pwn:3.1")
	pwn = wn.Wordnet("pwn:3.1")