Last active
January 2, 2017 22:30
-
-
Save yu-tang/6526991 to your computer and use it in GitHub Desktop.
OmegaT 用スクリプト。全分節の全参考訳文を列挙するサンプル。
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// demonstrate how to get matches for each entry | |
// original concept from | |
// https://libretraduko.wordpress.com/2013/09/10/export-relavant-tus-from-legacy-tmx-files-in-omegat/ | |
// by Kos Ivantsov 2013-09-10 | |
// + | |
// Some hacks and refactoring | |
// https://gist.github.com/yu-tang/6526991 | |
// by Yu-Tang | |
// + | |
// GUI independence and many improvements taken from
// http://pastebin.com/bukEe6Sb | |
// by cienislaw 2013-09-19 | |
// + | |
// Add multithreading | |
// by Yu-Tang 2013-09-21 | |
// + | |
// File selector taken from | |
// https://libretraduko.wordpress.com/2013/09/10/export-relavant-tus-from-legacy-tmx-files-in-omegat/ | |
// by Kos Ivantsov + mod by Yu-Tang 2013-09-22 | |
// =================================================================== | |
// Note: Currently this script does not handle multiple translations.
// If the project has segments with alternate translations,
// the outcome is not guaranteed.
//################## config start ##################
SELECT_FILES = 'no' // 'yes' to pick the file(s) to export via a file chooser; any other value exports the whole project.
MIN_SIMILARITY = 50 // minimum match similarity (0-100); matches scoring below it are not exported. 0 means everything.
SIMILARITY_TYPE = 'adjustedScore' // which score is compared against MIN_SIMILARITY ( score / scoreNoStem / adjustedScore )
//################## config end ##################
import groovy.swing.SwingBuilder | |
import groovy.time.TimeCategory | |
import groovy.time.TimeDuration | |
import groovy.transform.Synchronized | |
import org.omegat.core.Core | |
import org.omegat.core.data.IProject | |
import org.omegat.core.data.PrepareTMXEntry | |
import org.omegat.core.data.ProjectProperties | |
import org.omegat.core.data.SourceTextEntry | |
import org.omegat.core.data.TMXEntry | |
import org.omegat.core.matching.NearString | |
import org.omegat.gui.matches.FindMatchesThread | |
import org.omegat.gui.scripting.IScriptLogger | |
import org.omegat.util.TMXWriter2 | |
import javax.swing.* | |
import javax.swing.filechooser.FileFilter | |
import javax.swing.filechooser.FileSystemView | |
import java.security.MessageDigest | |
import java.util.concurrent.atomic.AtomicInteger | |
/**
 * Worker thread that exports the TUs relevant to one source entry.
 *
 * <p>For a translated segment the translation itself is handed to the
 * {@link Merger}; for an untranslated one the matches returned by
 * {@code search()} are handed over instead. When the work is done (or has
 * failed) the worker asks the merger to start the next worker and then
 * reports its own completion.
 */
class MatchesExportThread extends FindMatchesThread {
    private Merger merger
    private SourceTextEntry entry
    private IProject project

    /**
     * @param merger  coordinator that receives results, errors and completion notices
     * @param project project the entry belongs to
     * @param entry   source entry to process
     */
    public MatchesExportThread(final Merger merger, final IProject project, final SourceTextEntry entry) {
        // The super class gets null as its matcherPane argument,
        // so every method which refers to matcherPane has to be overridden.
        super(null, project, entry)
        this.merger = merger
        this.entry = entry
        this.project = project
    }

    /**
     * Always reports "not changed": there is no pane whose current entry
     * could be compared with the one being processed.
     */
    @Override
    protected boolean isEntryChanged() {
        return false
    }

    /**
     * Processes the entry, then schedules the next worker and signals completion.
     *
     * <p>The hand-over to the merger lives in {@code finally} blocks: if it
     * were skipped after a failure, the merger's runner count would never
     * drop to zero and its exit handling would never run.
     */
    @Override
    public void run() {
        try {
            export()
        } catch (Exception ex) {
            merger.setError ex, entry
        } finally {
            try {
                merger.createThread()
            } finally {
                merger.onThreadComplete()
            }
        }
    }

    /**
     * Exports the TUs for this entry. Any exception is left to {@link #run()},
     * which reports it to the merger.
     */
    private void export() {
        // Is this source already processed? Only entries flagged as
        // duplicates are tracked in the merger's processed-source set.
        if (entry.duplicate != SourceTextEntry.DUPLICATE.NONE
                && !merger.addProcessedSource(entry.srcText)) {
            return // skip already processed source
        }

        // Is this segment already translated? Then export it as it is.
        TMXEntry info = project.getTranslationInfo(entry)
        if (info.isTranslated()) {
            merger.writeEntry info
            return
        }

        // Untranslated: search matches and let the merger filter and write them.
        merger.setFoundResult search()
    }
}
/**
 * Custom TMXWriter class
 *
 * <ul><li>TU countable.
 * <li>Unique by source and translation pair. Duplicate TUs are not written.
 * </ul>
 *
 * <p>This class does no locking of its own. NOTE(review): in this script the
 * Merger guards its calls with {@code @Synchronized("writer")}; any other
 * caller needs equivalent protection.
 */
class HashedTMXWriter extends TMXWriter2 {
    // Digest used by getHash(); currently not part of the uniqueness check.
    private MessageDigest md = MessageDigest.getInstance('SHA-256')
    // Keys of all TUs written so far (one key per source/translation pair).
    private HashSet<ByteArrayWrapper> set = new HashSet<ByteArrayWrapper>()
    private int similarity
    private String similarityType
    private int fuzzyTUCount = 0

    /**
     * @param file           destination TMX file
     * @param props          ProjectProperties supplying the languages and the segmentation flag
     * @param similarity     minimum similarity a match needs in order to be written
     * @param similarityType name of the score property compared against {@code similarity}
     */
    public HashedTMXWriter(File file, ProjectProperties props, int similarity, String similarityType) {
        super(file,
                props.sourceLanguage,
                props.targetLanguage,
                props.isSentenceSegmentingEnabled(),
                true, // levelTwo
                true) // forceValidTMX
        this.similarity = similarity
        this.similarityType = similarityType
    }

    /**
     * Write one entry, unless the same source/translation pair was written before.
     *
     * @param entry   TMXEntry
     * @param isFuzzy true when the entry comes from a fuzzy match; such
     *                entries are counted separately
     */
    public void writeEntry(TMXEntry entry, boolean isFuzzy = false) {
        ByteArrayWrapper wrap = getByteArrayWrapper(entry)
        if (set.add(wrap)) {
            writeEntry entry.source, entry.translation, entry, null
            if (isFuzzy) {
                fuzzyTUCount++
            }
        }
    }

    /**
     * Write one entry built from a match. The match is dropped when its
     * first score of the configured type is below the minimum similarity.
     *
     * @param match NearString
     */
    public void writeEntry(NearString match) {
        // filtering by similarity
        if (match.scores[0][similarityType] < similarity) {
            return
        }
        TMXEntry entry = new TMXEntry(
                new PrepareTMXEntry(
                        source: match.source,
                        translation: match.translation,
                        changer: match.changer,
                        changeDate: match.changedDate,
                        creator: match.creator,
                        creationDate: match.creationDate,
                        note: null,
                        otherProperties: match.props
                ),
                true, // defaultTranslation
                null) // ExternalLinked
        writeEntry entry, true
    }

    /** @return number of unique TUs written so far */
    public int getTUCount() {
        set.size()
    }

    /** @return number of written TUs that did not come from fuzzy matches */
    public int getTranslatedTUCount() {
        getTUCount() - getFuzzyTUCount()
    }

    /** @return number of written TUs that came from fuzzy matches */
    public int getFuzzyTUCount() {
        fuzzyTUCount
    }

    /** @return SHA-256 digest of the UTF-8 bytes of the message */
    private byte[] getHash(String message) {
        md.digest message.getBytes('UTF-8')
    }

    /**
     * Builds the uniqueness key of an entry: source and translation joined
     * by a NUL character.
     *
     * <p>The text is encoded as UTF-8 explicitly. With the platform default
     * charset, characters that charset cannot represent are all replaced by
     * the same substitute byte, so different pairs could produce equal keys
     * and be dropped as duplicates.
     */
    private ByteArrayWrapper getByteArrayWrapper(TMXEntry entry) {
        byte[] bytes = (entry.source + '\0' + entry.translation).getBytes('UTF-8')
        new ByteArrayWrapper(bytes)
    }

    // Nested class - byte array wrapper giving arrays content-based equality.
    // Static, so the keys stored in the set carry no reference to the writer.
    private static final class ByteArrayWrapper {
        private final byte[] data

        public ByteArrayWrapper(byte[] data) {
            // Explicit null check: Groovy truth would also reject an empty array.
            if (data == null) {
                throw new NullPointerException()
            }
            this.data = data
        }

        @Override
        public boolean equals(Object other) {
            if (!(other instanceof ByteArrayWrapper)) {
                return false
            }
            Arrays.equals data, ((ByteArrayWrapper) other).data
        }

        @Override
        public int hashCode() {
            return Arrays.hashCode(data)
        }
    }
}
/**
 * Collects matches, filters them and writes them to the TMX file.
 *
 * <p>{@link #start()} launches up to {@code maxOfThreads} workers; each worker
 * calls {@link #createThread()} when it finishes, so the number of live
 * workers stays bounded until the entries run out. {@code runners} counts
 * everything that may still create or run a worker; when it drops to zero
 * the exit handling runs.
 */
class Merger extends Thread {
    private List<SourceTextEntry> entries
    private File fileTMX
    private HashedTMXWriter writer
    private int maxOfThreads
    private IProject project
    private IScriptLogger console
    private int sizeOfEntries
    private final AtomicInteger currentEntry = new AtomicInteger()
    private final AtomicInteger runners = new AtomicInteger()
    // Updated from several worker threads, hence atomic.
    private final AtomicInteger failed = new AtomicInteger()
    private final Date timeStart = new Date()
    private final Set<String> processedSources = Collections.synchronizedSet(new HashSet<String>())

    /**
     * Disables autosave and launches the initial workers.
     *
     * <p>Note that this overrides {@code Thread.start()} without calling it:
     * the work is done by the worker threads created here.
     */
    @Override
    public synchronized void start() {
        // get rid of autosave during processing
        Core.autoSave.disable()
        sizeOfEntries = entries.size()

        // The launcher counts as a runner while it creates workers. Without
        // this, a worker finishing early could drive the count to zero and
        // trigger the exit handling while workers are still being created.
        runners.incrementAndGet()
        try {
            // create matches search threads (at least one)
            int workers = Math.max(1, maxOfThreads)
            for (int i = 0; i < workers; i++) {
                if (!createThread()) {
                    break
                }
            }
        } finally {
            // Also reaches the exit handling when there was nothing to process.
            onThreadComplete()
        }
    }

    /**
     * Counts a failed entry and reports it on the console.
     *
     * @param error exception raised while processing the entry
     * @param entry entry being processed when the error occurred
     */
    public void setError(final Exception error, final SourceTextEntry entry) {
        failed.incrementAndGet()
        new SwingBuilder().doLater() {
            console.println "Error on seg. #${entry.entryNum()}: ${entry.srcText}"
            console.println error
        }
    }

    /**
     * Write TUs from matches.
     *
     * @param matches matches found for one entry; each one is passed to the writer
     */
    @Synchronized("writer")
    public void setFoundResult(final List<NearString> matches) {
        matches.each writer.&writeEntry
    }

    /**
     * Write one TU from translated segment.
     *
     * @param entry TMXEntry
     */
    @Synchronized("writer")
    public void writeEntry(TMXEntry entry) {
        writer.writeEntry entry
    }

    /**
     * create new MatchesExportThread thread
     *
     * <p>Must be called by a party that is itself counted in {@code runners}
     * (the launcher or a worker that has not reported completion yet).
     *
     * @return Returns true if new thread was created and false otherwise.
     */
    public boolean createThread() {
        int index = currentEntry.getAndIncrement()
        if (index >= sizeOfEntries) {
            return false
        }
        // Count the worker BEFORE starting it: it may finish, and decrement
        // the counter, before this thread gets to run its next statement.
        runners.incrementAndGet()
        new MatchesExportThread(this, project, entries[index]).start()
        true
    }

    /**
     * Adds the source string to HashSet if it is not already present.
     *
     * @param source String
     * @return true if this set did not already contain the specified element
     */
    public boolean addProcessedSource(String source) {
        processedSources.add(source)
    }

    /* called when each thread completed */
    public void onThreadComplete() {
        if (runners.decrementAndGet() == 0) {
            onExit()
        }
    }

    /* called when all threads completed */
    private void onExit() {
        try {
            writer.close()

            // delete TMX file when it has no TU
            if (writer.TUCount == 0) {
                def folder = fileTMX.parentFile
                fileTMX.delete()
                // list() returns null when the folder cannot be read
                if (folder.list()?.size() == 0) {
                    folder.delete() // delete empty folder too
                }
            }

            // output summary
            consolePrintln getSummary()
        } finally {
            // restore AutoSave even if closing or cleaning up failed
            Core.autoSave.enable()
        }
    }

    /* prints each argument on the console via SwingBuilder.doLater */
    private void consolePrintln(Object... args) {
        new SwingBuilder().doLater() { args.each console.&println }
    }

    /* builds the two-line summary: TU counts (plus failures, if any) and elapsed time */
    private String getSummary() {
        TimeDuration td = TimeCategory.minus(new Date(), this.timeStart)
        int failures = failed.get()
        String counts = "Exported ${writer.TUCount} TUs (translated ${writer.translatedTUCount} + fuzzy ${writer.fuzzyTUCount})."
        String failure = failures ? ' failed ' + failures + '.' : ''
        String elapsed = "It took ${td.hours ? td.hours + ' h ' : ''}${td.minutes} min ${td.seconds} sec ${td.millis} ms"
        counts + failure + '\n' + elapsed
    }
}
/**
 * FileSystemView for restricted browse only under the specified directory
 *
 * <p>The single root, and the home directory, is {@code rootDirectory}.
 * Creating folders is not supported.
 */
public class DirectoryRestrictedFileSystemView extends FileSystemView {
    private File rootDirectory

    /** @return true only for the restricting root directory */
    @Override
    public boolean isRoot(File f) {
        return rootDirectory.equals(f)
    }

    /** @return an array holding the root directory as its only element */
    @Override
    public File[] getRoots() {
        return [rootDirectory] as File[]
    }

    /** @return the root directory, which doubles as home directory */
    @Override
    public File getHomeDirectory() {
        return rootDirectory
    }

    /**
     * Returns the parent of {@code dir}, but never climbs above the root:
     * asking for the parent of the root yields the root itself. Delegating
     * to the super class unconditionally would return the real parent and
     * let the user leave the restricted directory.
     */
    @Override
    public File getParentDirectory(File dir) {
        if (rootDirectory.equals(dir)) {
            return rootDirectory
        }
        return super.getParentDirectory(dir)
    }

    @Override
    protected File createFileSystemRoot(File f) {
        throw new UnsupportedOperationException("Not supported yet.")
    }

    @Override
    public File createNewFolder(File containingDir) throws IOException {
        throw new UnsupportedOperationException("Not supported yet.")
    }
}
/**
 * FileFilter that accepts every directory plus the files on a white list.
 */
public class WhiteListFilter extends FileFilter {
    private List<File> whiteList

    /**
     * @param f file or directory offered by the chooser
     * @return true for directories and for white-listed files
     */
    public boolean accept(File f) {
        // Directories must stay visible, otherwise nobody could look into them.
        if (f.isDirectory()) {
            return true
        }
        return whiteList.contains(f)
    }

    /** @return the label shown for this filter */
    public String getDescription() {
        'OmegaT Source files'
    }
}
/**
 * Returns the export destination, 'tmx_export/exported_relevant.tmx' under
 * the project root. The 'tmx_export' folder is created when it is missing.
 */
File getDestTmxFile() {
    File exportDir = new File(project.projectProperties.projectRoot, 'tmx_export')
    if (!exportDir.exists()) {
        exportDir.mkdir()
    }
    return new File(exportDir, 'exported_relevant.tmx')
}
/**
 * Lets the user choose source files and returns the entries of those files.
 *
 * <p>The chooser is restricted to the project's source root and shows only
 * directories and project files. A selected file that cannot be matched to
 * a project file is reported on the console and skipped.
 *
 * @return entries of the chosen files; empty when the dialog was canceled
 */
List<SourceTextEntry> getSelectedFilesEentries() {
    String sourceRoot = project.projectProperties.sourceRoot
    File rootDir = new File(sourceRoot)
    List<SourceTextEntry> entries = new ArrayList<SourceTextEntry>()
    def projectFiles = project.projectFiles

    JFileChooser fc = new JFileChooser(
            rootDir,
            new DirectoryRestrictedFileSystemView(rootDirectory: rootDir))
    fc.acceptAllFileFilterUsed = false
    fc.addChoosableFileFilter new WhiteListFilter(
            whiteList: projectFiles.collect() { new File(rootDir, it.filePath) })
    fc.dialogTitle = 'Choose files to export'
    fc.fileSelectionMode = JFileChooser.FILES_ONLY
    fc.multiSelectionEnabled = true
    if (fc.showOpenDialog(mainWindow.applicationFrame) != JFileChooser.APPROVE_OPTION) {
        console.println 'Canceled'
        return entries
    }

    def files = fc.selectedFiles
    int lenSourceDirPath = sourceRoot.size()
    console.println "Choosed ${files.size()} file(s)."

    // Compare paths with unified separators, so a difference in separator
    // style alone cannot prevent a match.
    // NOTE(review): assumes filePath is relative to the source root - confirm.
    def unifySeparators = { String path -> path.replace('\\', '/') }

    files.each() { file ->
        String fullPath = file.canonicalPath
        def fi = null
        // Guard the substring: the path may be shorter than the root prefix.
        if (fullPath.size() > lenSourceDirPath) {
            String relativePath = unifySeparators(fullPath.substring(lenSourceDirPath))
            fi = projectFiles.find() { unifySeparators(it.filePath) == relativePath }
        }
        if (fi == null) {
            // e.g. a name typed by hand or a path resolved through a link
            console.println "Skipped (not a project source file): ${fullPath}"
            return // continue with the next file
        }
        // append in place instead of building a new list for every file
        entries.addAll fi.entries
    }
    entries
}
//===========================================
// main flow
//===========================================

// Prerequisite: a project must be loaded.
if (!project.isProjectLoaded()) {
    console.println "no project found."
    return
}

// Collect the entries to export: chosen files only, or the whole project.
def entries
if (SELECT_FILES == 'yes') {
    entries = getSelectedFilesEentries()
} else {
    entries = project.allEntries
}

// Nothing to do without entries.
if (entries.size() == 0) {
    console.println "no entry found."
    return
}

// Prepare the destination file and its writer.
File tmx = getDestTmxFile()
HashedTMXWriter writer = new HashedTMXWriter(
        tmx,
        project.projectProperties,
        MIN_SIMILARITY,
        SIMILARITY_TYPE)
writer.writeComment " Default translations "

// Use one worker per available processor.
int processors = Runtime.runtime.availableProcessors()
def merger = new Merger(
        entries: entries,
        fileTMX: tmx,
        writer: writer,
        maxOfThreads: processors,
        project: project,
        console: console)
merger.start()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment