Trac searchattachments
import codecs
import os
import commands
import re
import urllib
import dircache
import logging
import string, time
# import random, md5
from trac.core import *
from trac.config import (PathOption, IntOption, BoolOption, Section, Configuration)
from trac.search import ISearchSource
from trac.attachment import IAttachmentChangeListener
from trac.util.datefmt import localtz
from trac.util.text import to_unicode
from datetime import datetime

class AttachmentSearchPlugin(Component):
    """Implements a search function within uploaded attachments,
    integrated into the normal search function.
    """

    implements(ISearchSource, IAttachmentChangeListener)

    def _absolute_index_dir(self):
        return os.path.join(self.env.path, 'attachments', 'index')
    absolute_index_dir = property(_absolute_index_dir)

    swish = PathOption('searchattachments', 'swish',
        doc="Absolute path to the swish binary.")
    seat = PathOption('searchattachments', 'seat',
        doc="Absolute path to the trac-seat utility")
    max_excerpt_search = IntOption(
        'searchattachments', 'max_excerpt_search', default=500000,
        doc="max. number of bytes read from files containing the keywords to"
            " extract an excerpt")
    excerpt_before = IntOption(
        'searchattachments', 'excerpt_before', default=100,
        doc="start search excerpt before found keyword (no. of characters)")
    excerpt_after = IntOption(
        'searchattachments', 'excerpt_after', default=400,
        doc="end search excerpt after found keyword (no. of characters)")

    #############################
    # IAttachmentChangeListener #
    #############################

    def attachment_added(self, attachment):
        if self._create_metafile(attachment):
            self._build_index()
            return True
        else:
            self.env.log.error('Error while creating meta file for %s' % attachment.filename)
            return False

    def attachment_deleted(self, attachment):
        meta_path = attachment.path + '.meta'
        # Remove the meta file
        try:
            os.remove(meta_path)
        except OSError:
            self.env.log.warn('Cannot delete %s.meta' % attachment.filename)
        # Rebuild the index. Running this in the background would leave the
        # attachment content unindexed when an existing attachment is
        # overwritten with the same name (deletion followed by immediate
        # re-addition), so force the foreground.
        self._build_index(background=False)
        return

    ###########################
    # ISearchSource listeners #
    ###########################

    def get_search_filters(self, req):
        if 'WIKI_VIEW' in req.perm('wiki'):
            yield ('attachments', 'Attachments')

    def get_search_results(self, req, keywords, filters):
        if 'attachments' not in filters:
            return
        # Prepare the keywords
        query = ' '.join(keywords)
        self.env.log.debug('Search query: %s' % query)
        # Run the external command to get the raw search result
        index_file = self._get_index_file()
        if not index_file:
            return
        cmd = '%s -f %s -w %s' % (self.swish, index_file, query)
        self.env.log.debug('command = %s' % cmd)
        error, output = commands.getstatusoutput(cmd.encode("iso8859-15"))
        if error:
            # TODO: just return or raise an exception here?
            self.env.log.error(output)
            raise Exception(output)
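        # For reference (an assumption about swish-e output, not taken from this
        # source): result lines are expected to look roughly like
        #   1000 /path/to/env/attachments/ticket/123/report.pdf.meta "report.pdf" 4096
        # i.e. rank, path to the .meta file, quoted title and size, which is what
        # the regular expression below picks apart; lines starting with '#' are
        # swish-e comments and are skipped.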
        # Parse the output of the command
        for line in output.split('\n'):
            line = line.strip(' ')
            if line and line[0] != '#':
                self.env.log.debug('line = %s' % line)
                # This is not a comment... let's parse the line
                pattern = re.compile(r'^(\d*) (.*)/(.*)\.meta "(.*)" (\d*)$')
                hit = pattern.match(line)
                if hit:
                    sw_rank = hit.group(1)
                    # Safeguard against how the swish index files have been
                    # created
                    sw_abs_dir = os.path.normpath(hit.group(2))
                    sw_filename = hit.group(3)
                    sw_title = hit.group(4)
                    sw_end = hit.group(5)
                    regexp = '^' + self.env.path + '/attachments/(.*)$'
                    p = re.compile(regexp)
                    m = p.match(sw_abs_dir)
                    if m:
                        sw_dir = m.group(1)
                        filepath = os.path.join(sw_abs_dir, sw_filename)
                    else:
                        sw_dir = sw_abs_dir
                        filepath = os.path.join(self.env.path, 'attachments', sw_dir, sw_filename)
                    # Build the variables that we'll return for this hit
                    relative_url = 'attachment/%s/%s' % (sw_dir, sw_filename)
                    title = 'Attachment::%s/%s' % (sw_dir, urllib.url2pathname(sw_filename))
                    absolute_url = '/'.join([req.href(), relative_url])
                    if os.path.exists(filepath):
                        date = datetime.fromtimestamp(os.path.getmtime(filepath), localtz)
                    else:
                        # Skip hits whose attachment no longer exists instead of
                        # aborting the whole result set
                        continue
                    if self.max_excerpt_search > 0:
                        excerpt = self._make_excerpt(filepath + '.meta', keywords)
                    else:
                        excerpt = ''
                    # Return the hit
                    yield (absolute_url, to_unicode(title), date,
                           u'SearchAttachments', excerpt)

    ####################
    # Private methods  #
    ####################

    def _make_excerpt(self, metafile, keywords):
        """Create an excerpt around the first matching keyword."""
        try:
            handle = codecs.open(metafile, 'r', 'utf_8', 'ignore')
        except IOError:
            handle = None
        if handle:
            content = handle.read(self.max_excerpt_search)
            handle.close()
            # Transform wildcards that may exist in the query
            # into a regexp equivalent
            wildcard = keywords[0].replace('*', '[a-z0-9]*')
            regexp = '(%s)' % wildcard
            m = re.search(regexp, content.lower())
            if m:
                prefix = '... '
                suffix = ' ...'
                foundpos = m.start(1)
                start = foundpos - self.excerpt_before
                if start < 0:
                    start = 0
                    prefix = ''
                end = foundpos + self.excerpt_after
                if end > len(content):
                    suffix = ''
                excerpt = ''.join([prefix, content[start:end], suffix])
            else:
                excerpt = (
                    '[Excerpt not available (first keyword not in meta file'
                    ' search limit)]')
        else:
            excerpt = '[Excerpt not available (cannot read the meta file)]'
        # Clean up some unprintable characters added by catdoc
        excerpt = multiple_replace({'\x0a': ' ', '\x0c': ' '}, excerpt)
        return excerpt

    def _get_index_file(self):
        """Return the most recent index file found in the index directory."""
        if not os.path.isdir(self.absolute_index_dir):
            self.env.log.warning('index directory does not exist')
            return None
        # Read all the matching index.* files into a dictionary
        indexes = {}
        for item in dircache.listdir(self.absolute_index_dir):
            path = os.path.join(self.absolute_index_dir, item)
            prefix_pattern = re.compile(r'^index\.swish-e\.(.*)$')
            prefix = prefix_pattern.match(item)
            if prefix:
                # Can be index.xxxx or index.xxxx.prop or index.xxxx.temp
                key = prefix.group(1)
                if key.endswith('.temp'):
                    # Ignore files ending with *.temp
                    continue
                if not key.endswith('.prop'):
                    # This is an index file ...
                    # ... record its last modification time
                    indexes[path] = os.path.getmtime(path)
        # Did we find any index files?
        if not indexes:
            self.env.log.warning(
                '%s: %s', os.path.basename(self.env.path),
                'attachments/index does not contain any index file')
            return None
        # Sort the index files by modification time and keep the most recent one
        sorted_indexes = sorted(indexes.iteritems(), key=lambda entry: entry[1])
        last_index = sorted_indexes[-1]
        return last_index[0]

    def _build_index(self, conf_file='swish.config', background=True):
        """Build a new index by launching the build_index script.

        Runs as a background process if 'background' is True (the default).
        """
        self.env.log.info('Starting index update')
        cmd = '%s "%s" index -s "%s" -c' % (self.seat, self.env.path, self.swish)
        self.env.log.debug(cmd)
        if background:
            # The script is launched as a background process,
            # so we return immediately
            os.system(cmd + ' &')
        else:
            os.system(cmd)
        return

    def _create_metafile(self, attachment):
        """Create a meta file (i.e. a plain-text version) for the attachment.

        Only extensions with a filter.* command defined in trac.ini will be
        processed.
        """
        meta_file = attachment.path + '.meta'
        extension = self._get_extension(attachment)
        if not extension:
            msg = '.meta not created for %s (cannot determine file type)' % attachment.filename
            self.env.log.error(msg)
            return False
        self.env.log.debug('Uploaded file with extension = %s' % extension)
        # Iterate over the filter.* entries in the [searchattachments] section
        # of the config file to build a dictionary of extension:command
        filters = {'txt': 'cp -rf "%s" "%s"'}
        for entry in self.config['searchattachments']:
            p = re.compile(r'filter\.([a-zA-Z0-9]*)')
            m = p.match(entry)
            if m:
                format = m.group(1).lower()
                filters[format] = self.config['searchattachments'].get(entry)
        # Is there a known filter for this extension?
        if extension in filters:
            # Convert to text with the command defined for this extension
            if os.path.exists(meta_file):
                os.remove(meta_file)
            cmd = filters[extension] % (attachment.path, meta_file)
            self.env.log.debug("Converting to text: %s" % cmd)
            error, output = commands.getstatusoutput(cmd)
            # Was the command successful?
            if error:
                self.env.log.error('Error when converting .%s to text :: %s' % (extension, output))
                return False
            else:
                self.env.log.info('File %s successfully converted to text' % attachment.filename)
                return True
        # Unknown extension
        else:
            self.env.log.warn('No filter defined for this extension. Add a filter.%s entry in trac.ini' % extension)
            return False
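
    # Example (illustrative only): filter.* entries are shell commands with two
    # "%s" placeholders (attachment path, meta file path), mirroring the built-in
    # 'txt' filter above. The converters named here are common choices and are
    # assumptions, not something this plugin ships with:
    #
    #   [searchattachments]
    #   filter.pdf = pdftotext "%s" "%s"
    #   filter.doc = catdoc "%s" > "%s"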

    def _get_extension(self, attachment):
        pattern = re.compile(r'^.*\.([a-zA-Z0-9]*)$')
        suffix = pattern.match(attachment.filename.strip())
        if suffix:
            ext = suffix.group(1).lower()
            # Synonyms for txt
            if ext in ['txt', 'text', 'xml']:
                return 'txt'
            else:
                return ext
        # No suffix... may still be a text file
        elif is_text_file(attachment.path):
            return 'txt'
        # Cannot determine the type
        else:
            return False


##########################
# Some utility functions #
##########################

def is_text(content):
    """Determine whether the content is text."""
    text_characters = "".join(map(chr, range(32, 127)) + list("\n\r\t\b"))
    _null_trans = string.maketrans("", "")
    if "\0" in content:
        return False
    if not content:  # Empty files are considered text
        return True
    # Get the non-text "binary" characters
    binary = content.translate(_null_trans, text_characters)
    # If more than 30% of the characters are non-text, then
    # this is considered a binary file
    if len(binary) <= len(content) * 0.3:
        return True
    return False

def is_text_file(filepath, blocksize=512):
    """Read the first 'blocksize' bytes of the specified file.

    Returns True if it looks like a text file, False otherwise.
    """
    with open(filepath) as handle:
        return is_text(handle.read(blocksize))

def multiple_replace(mapping, text):
    """Replace in 'text' all occurrences of the keys of 'mapping' with their
    corresponding values. Returns the new string.
    """
    # Create a regular expression from the dictionary keys
    regex = re.compile("(%s)" % "|".join(map(re.escape, mapping.keys())))
    # For each match, look up the corresponding value in the dictionary
    return regex.sub(lambda mo: mapping[mo.group(0)], text)