pip install markdown imgkit
You'll also need to install wkhtmltopdf on your system:
On Ubuntu/Debian: sudo apt-get install wkhtmltopdf
On macOS: brew install wkhtmltopdf
On Windows: Download the installer from the wkhtmltopdf website.
pip install markdown imgkit
You'll also need to install wkhtmltopdf on your system:
On Ubuntu/Debian: sudo apt-get install wkhtmltopdf
On macOS: brew install wkhtmltopdf
On Windows: Download the installer from the wkhtmltopdf website.
| #!/usr/bin/env python3 | |
| """Convert TMX files to ParaConc format. | |
| This script converts TMX (Translation Memory eXchange) files to ParaConc format, | |
| which consists of three separate XML files: source language, target language, | |
| and alignment information. It supports complex alignment patterns, HTML tag preservation, | |
| and includes input validation. | |
| Example usage: | |
| python tmx_to_paraconc.py input.tmx -o output_prefix |
| import os | |
| import torch | |
| import numpy as np | |
| from sentence_transformers import SentenceTransformer | |
| # Load pre-trained model for sentence embeddings | |
| model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2") | |
| # Set up LSTM model | |
| input_size = 768 # Size of the sentence embeddings |
| import json | |
| import argparse | |
| from typing import Dict | |
| from pathlib import Path | |
| import smart_open | |
| import ftfy | |
| from tqdm import tqdm | |
| import html2text | |
| from datasets import load_dataset |
| import argparse | |
| from flair.data import Sentence | |
| from flair.embeddings import ( | |
| DocumentEmbeddings, | |
| FlairEmbeddings, | |
| DocumentLMEmbeddings, | |
| DocumentPoolEmbeddings, | |
| ) | |
| from torch import Tensor |
| #!/bin/bash | |
| # Requires GNU parallel and pv to run: `sudo apt-get install parallel pv` | |
| # download file containing urls | |
| curl http://webdatacommons.org/structureddata/2022-12/files/file.list > urls.txt | |
| # create output file | |
| touch output.txt |
| import bz2 | |
| import logging | |
| import multiprocessing | |
| import re | |
| from pickle import PicklingError | |
| # LXML isn't faster, so let's go with the built-in solution | |
| from xml.etree.ElementTree import iterparse |
| # pip install pymorphy3 | |
| # pip install pymorphy3-dicts-uk | |
| import pymorphy3 | |
| from collections import defaultdict | |
| from itertools import product | |
| from typing import List, List | |
| morph = pymorphy3.MorphAnalyzer(lang="uk") |
| import os.path | |
| from flair.data import Dictionary | |
| from flair.models import LanguageModel | |
| from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus | |
| def train_flair_embeddings( | |
| corpus_path="/data/ubertext/for_flair", | |
| dictionary_path="/home/dima/Projects/flair_embeddings/flair_dictionary.pkl", | |
| lm_file="./language_model_forward_no_amp_accum_grad_fixed", |
| import wn | |
| import csv | |
| from collections import Counter, defaultdict | |
| from tqdm.notebook import tqdm | |
| wn.download("pwn:3.1") | |
| pwn = wn.Wordnet("pwn:3.1") |