This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# Runs in Python 3 | |
# Processes MS Word files with Tika. No exception or error handling; use at your own risk. | |
# Assumes lowercase .doc and .docx extensions. | |
# Output is written to new files with corresponding file exts in the same dir. | |
import os | |
import sys | |
import subprocess | |
import glob |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob | |
import os | |
# USAGE: | |
# 1. Download the Tika command-line tool from http://tika.apache.org/download.html. | |
# 2. Put some files in the same directory. | |
# 3. Put this script in the same directory (make sure you have Python). | |
# 4. In the command line, write "python dotika.py". | |
# If Tika can extract your files, a new file with the extension .new | |
# will be created for each file matching the "extension" filter (see |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Extracts a TEI bilingual vocabulary to a term-tab-term plain-text structure""" | |
"""Note: this script uses the BeautifulSoup library for TEI parsing.""" | |
"""See http://www.crummy.com/software/BeautifulSoup/bs4 for details.""" | |
from bs4 import BeautifulSoup | |
import sys | |
import codecs | |
if len(sys.argv) < 2: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0"?><tmx version="1.4"> | |
<header adminlang="EN-GB" | |
creationtool="CADT TMX Generator" | |
creationtoolversion="1.0" | |
datatype="unknown" | |
o-tmf="TW4Win 2.0 Format" | |
segtype="sentence" | |
srclang="EN-GB"> | |
<prop type="Att::Source File">S_2012_10_Add_20_ES-mod</prop> | |
</header> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import os | |
import os.path | |
import zipfile | |
from datetime import datetime | |
from re import sub | |
sourceFile = zipfile.ZipFile('text.docx') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import codecs | |
import os | |
import re | |
import unicodedata | |
# Loads a UTF-8 text file into memory as a character string | |
def readDoc(filename): | |
file = codecs.open(filename, "r", "UTF-8") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import codecs | |
import os | |
import re | |
import unicodedata | |
# Loads a UTF-8 text file into memory as a character string | |
def readDoc(filename): | |
file = codecs.open(filename, "r", "UTF-8") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# | |
# Counts Arabic words with 1-10 characters in a text file. | |
# Counts hapaxes. | |
# A shadda is counted as one (1) character. | |
# All other diacritics and punctuation are discarded. | |
# Assumes tokenized UTF-8 input. | |
import sys | |
import os.path |