Trac searchattachments
import codecs
import os
import commands
import re
import urllib
import dircache
import logging
import string, time
# import random, md5
from trac.core import *
from trac.config import (PathOption, IntOption, BoolOption, Section, Configuration)
from trac.search import ISearchSource
from trac.attachment import IAttachmentChangeListener
from trac.util.datefmt import localtz
from trac.util.text import to_unicode
from datetime import datetime

class AttachmentSearchPlugin(Component):
    """Implements a search function within uploaded attachments,
    integrated into the normal search function.
    """

    implements(ISearchSource, IAttachmentChangeListener)

    def _absolute_index_dir(self):
        return os.path.join(self.env.path, 'attachments', 'index')
    absolute_index_dir = property(_absolute_index_dir)

    swish = PathOption('searchattachments', 'swish',
        doc="Absolute path to the swish binary.")
    seat = PathOption('searchattachments', 'seat',
        doc="Absolute path to the trac-seat utility")
    max_excerpt_search = IntOption(
        'searchattachments', 'max_excerpt_search', default=500000,
        doc="max. number of bytes read from files containing the keywords to"
            " extract an excerpt")
    excerpt_before = IntOption(
        'searchattachments', 'excerpt_before', default=100,
        doc="start search excerpt before found keyword (no. of characters)")
    excerpt_after = IntOption(
        'searchattachments', 'excerpt_after', default=400,
        doc="end search excerpt after found keyword (no. of characters)")

    #############################
    # IAttachmentChangeListener #
    #############################

    def attachment_added(self, attachment):
        if self._create_metafile(attachment):
            self._build_index()
            return True
        else:
            self.env.log.error('Error while creating meta file for %s' % attachment.filename)
            return False

    def attachment_deleted(self, attachment):
        meta_path = attachment.path + '.meta'
        # Remove the meta file
        try:
            os.remove(meta_path)
        except OSError:
            self.env.log.warn('Cannot delete %s.meta' % attachment.filename)
        # Rebuild the index. Running this in the background would leave the
        # attachment content unindexed when an existing attachment is
        # overwritten with the same name (deletion followed by immediate
        # re-addition), so force the foreground.
        self._build_index(background=False)
        return

    ###########################
    # ISearchSource listeners #
    ###########################

    def get_search_filters(self, req):
        if 'WIKI_VIEW' in req.perm('wiki'):
            yield ('attachments', 'Attachments')

    def get_search_results(self, req, keywords, filters):
        if 'attachments' not in filters:
            return
        # Prepare the keywords
        query = ' '.join(keywords)
        self.env.log.debug('Search query: %s' % query)
        # Run the external command to get the raw search result
        index_file = self._get_index_file()
        if not index_file:
            return
        cmd = '%s -f %s -w %s' % (self.swish, index_file, query)
        self.env.log.debug('command = %s' % cmd)
        error, output = commands.getstatusoutput(cmd.encode("iso8859-15"))
        if error:
            # TODO: just return or raise an exception here?
            self.env.log.error(output)
            raise Exception(output)
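        # For reference (an assumption about swish-e output, not taken from this
        # source): result lines are expected to look roughly like
        #   1000 /path/to/env/attachments/ticket/123/report.pdf.meta "report.pdf" 4096
        # i.e. rank, path to the .meta file, quoted title and size, which is what
        # the regular expression below picks apart; lines starting with '#' are
        # swish-e comments and are skipped.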
        # Parse the output of the command
        for line in output.split('\n'):
            line = line.strip(' ')
            if line and line[0] != '#':
                self.env.log.debug('line = %s' % line)
                # This is not a comment... let's parse the line
                pattern = re.compile(r'^(\d*) (.*)/(.*)\.meta "(.*)" (\d*)$')
                hit = pattern.match(line)
                if hit:
                    sw_rank = hit.group(1)
                    # Safeguard against how the swish index files have been
                    # created
                    sw_abs_dir = os.path.normpath(hit.group(2))
                    sw_filename = hit.group(3)
                    sw_title = hit.group(4)
                    sw_end = hit.group(5)
                    regexp = '^' + self.env.path + '/attachments/(.*)$'
                    p = re.compile(regexp)
                    m = p.match(sw_abs_dir)
                    if m:
                        sw_dir = m.group(1)
                        filepath = os.path.join(sw_abs_dir, sw_filename)
                    else:
                        sw_dir = sw_abs_dir
                        filepath = os.path.join(self.env.path, 'attachments', sw_dir, sw_filename)
                    # Build the variables that we'll return for this hit
                    relative_url = 'attachment/%s/%s' % (sw_dir, sw_filename)
                    title = 'Attachment::%s/%s' % (sw_dir, urllib.url2pathname(sw_filename))
                    absolute_url = '/'.join([req.href(), relative_url])
                    if os.path.exists(filepath):
                        date = datetime.fromtimestamp(os.path.getmtime(filepath), localtz)
                    else:
                        # Skip hits whose attachment no longer exists instead of
                        # aborting the whole result set
                        continue
                    if self.max_excerpt_search > 0:
                        excerpt = self._make_excerpt(filepath + '.meta', keywords)
                    else:
                        excerpt = ''
                    # Return the hit
                    yield (absolute_url, to_unicode(title), date,
                           u'SearchAttachments', excerpt)

    ####################
    # Private methods  #
    ####################

    def _make_excerpt(self, metafile, keywords):
        """Create an excerpt around the first matching keyword."""
        try:
            handle = codecs.open(metafile, 'r', 'utf_8', 'ignore')
        except IOError:
            handle = None
        if handle:
            content = handle.read(self.max_excerpt_search)
            handle.close()
            # Transform wildcards that may exist in the query
            # into a regexp equivalent
            wildcard = keywords[0].replace('*', '[a-z0-9]*')
            regexp = '(%s)' % wildcard
            m = re.search(regexp, content.lower())
            if m:
                prefix = '... '
                suffix = ' ...'
                foundpos = m.start(1)
                start = foundpos - self.excerpt_before
                if start < 0:
                    start = 0
                    prefix = ''
                end = foundpos + self.excerpt_after
                if end > len(content):
                    suffix = ''
                excerpt = ''.join([prefix, content[start:end], suffix])
            else:
                excerpt = (
                    '[Excerpt not available (first keyword not in meta file'
                    ' search limit)]')
        else:
            excerpt = '[Excerpt not available (cannot read the meta file)]'
        # Clean up some unprintable characters added by catdoc
        excerpt = multiple_replace({'\x0a': ' ', '\x0c': ' '}, excerpt)
        return excerpt

    def _get_index_file(self):
        """Return the most recent index file found in the index directory."""
        if not os.path.isdir(self.absolute_index_dir):
            self.env.log.warning('index directory does not exist')
            return None
        # Read all the matching index.* files into a dictionary
        indexes = {}
        for item in dircache.listdir(self.absolute_index_dir):
            path = os.path.join(self.absolute_index_dir, item)
            prefix_pattern = re.compile(r'^index\.swish-e\.(.*)$')
            prefix = prefix_pattern.match(item)
            if prefix:
                # Can be index.xxxx or index.xxxx.prop or index.xxxx.temp
                key = prefix.group(1)
                if key.endswith('.temp'):
                    # Ignore files ending with *.temp
                    continue
                if not key.endswith('.prop'):
                    # This is an index file ...
                    # ... record its last modification time
                    indexes[path] = os.path.getmtime(path)
        # Did we find any index files?
        if not indexes:
            self.env.log.warning(
                '%s: %s', os.path.basename(self.env.path),
                'attachments/index does not contain any index file')
            return None
        # Sort the index files by modification time and keep the most recent one
        sorted_indexes = sorted(indexes.iteritems(), key=lambda entry: entry[1])
        last_index = sorted_indexes[-1]
        return last_index[0]

    def _build_index(self, conf_file='swish.config', background=True):
        """Build a new index by launching the build_index script.

        Runs as a background process if 'background' is True (the default).
        """
        self.env.log.info('Starting index update')
        cmd = '%s "%s" index -s "%s" -c' % (self.seat, self.env.path, self.swish)
        self.env.log.debug(cmd)
        if background:
            # The script is launched as a background process,
            # so we return immediately
            os.system(cmd + ' &')
        else:
            os.system(cmd)
        return

    def _create_metafile(self, attachment):
        """Create a meta file (i.e. a plain-text version) for the attachment.

        Only extensions with a filter.* command defined in trac.ini will be
        processed.
        """
        meta_file = attachment.path + '.meta'
        extension = self._get_extension(attachment)
        if not extension:
            msg = '.meta not created for %s (cannot determine file type)' % attachment.filename
            self.env.log.error(msg)
            return False
        self.env.log.debug('Uploaded file with extension = %s' % extension)
        # Iterate over the filter.* entries in the [searchattachments] section
        # of the config file to build a dictionary of extension:command
        filters = {'txt': 'cp -rf "%s" "%s"'}
        for entry in self.config['searchattachments']:
            p = re.compile(r'filter\.([a-zA-Z0-9]*)')
            m = p.match(entry)
            if m:
                format = m.group(1).lower()
                filters[format] = self.config['searchattachments'].get(entry)
        # Is there a known filter for this extension?
        if extension in filters:
            # Convert to text with the command defined for this extension
            if os.path.exists(meta_file):
                os.remove(meta_file)
            cmd = filters[extension] % (attachment.path, meta_file)
            self.env.log.debug("Converting to text: %s" % cmd)
            error, output = commands.getstatusoutput(cmd)
            # Was the command successful?
            if error:
                self.env.log.error('Error when converting .%s to text :: %s' % (extension, output))
                return False
            else:
                self.env.log.info('File %s successfully converted to text' % attachment.filename)
                return True
        # Unknown extension
        else:
            self.env.log.warn('No filter defined for this extension. Add a filter.%s entry in trac.ini' % extension)
            return False
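
    # Example (illustrative only): filter.* entries are shell commands with two
    # "%s" placeholders (attachment path, meta file path), mirroring the built-in
    # 'txt' filter above. The converters named here are common choices and are
    # assumptions, not something this plugin ships with:
    #
    #   [searchattachments]
    #   filter.pdf = pdftotext "%s" "%s"
    #   filter.doc = catdoc "%s" > "%s"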

    def _get_extension(self, attachment):
        pattern = re.compile(r'^.*\.([a-zA-Z0-9]*)$')
        suffix = pattern.match(attachment.filename.strip())
        if suffix:
            ext = suffix.group(1).lower()
            # Synonyms for txt
            if ext in ['txt', 'text', 'xml']:
                return 'txt'
            else:
                return ext
        # No suffix... may still be a text file
        elif is_text_file(attachment.path):
            return 'txt'
        # Cannot determine the type
        else:
            return False


##########################
# Some utility functions #
##########################

def is_text(content):
    """Determine whether the content is text."""
    text_characters = "".join(map(chr, range(32, 127)) + list("\n\r\t\b"))
    _null_trans = string.maketrans("", "")
    if "\0" in content:
        return False
    if not content:  # Empty files are considered text
        return True
    # Get the non-text "binary" characters
    binary = content.translate(_null_trans, text_characters)
    # If more than 30% of the characters are non-text, then
    # this is considered a binary file
    if len(binary) <= len(content) * 0.3:
        return True
    return False

def is_text_file(filepath, blocksize=512):
    """Read the first 'blocksize' bytes of the specified file.

    Returns True if it looks like a text file, False otherwise.
    """
    with open(filepath) as handle:
        return is_text(handle.read(blocksize))

def multiple_replace(mapping, text):
    """Replace in 'text' all occurrences of the keys of 'mapping' with their
    corresponding values. Returns the new string.
    """
    # Create a regular expression from the dictionary keys
    regex = re.compile("(%s)" % "|".join(map(re.escape, mapping.keys())))
    # For each match, look up the corresponding value in the dictionary
    return regex.sub(lambda mo: mapping[mo.group(0)], text)