-
-
Save vad/568077 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## From http://code.google.com/p/soxred93tools/source/browse/trunk/web/rfap/rfalib3.php
/*
RfA Analysis Library 2.05
This version breaks compatibility with the 1.x series
Copyright (C) 2006 Tangotango (tangotango.wp _at_ gmail _dot_ com)
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
"/\[\[[Uu]ser(?:[\s_][Tt]alk)?\:([^\]\|\/]*)(?:\|[^\]]*)?\]\]" //1: Normal [[User:XX]] and [[User talk:XX]]
."|\{\{(?:[Ff]ullurl\:[Uu]ser(?:[\s_][Tt]alk)?\:|[Uu]nsigned\|)([^\}\|]*)(?:|[\|\}]*)?\}\}" //2: {{fullurl}} and {{unsigned}} templates
."|(?:\{\{)[Uu]ser(?:[\s_][Tt]alk)?\:([^\}\/\|]*)" //3: {{User:XX/sig}} templates
."|\{\{[Uu]nsigned2\|[^\|]*\|([^\}]*)\}\}" //4: {{unsigned2|Date|XX}} templates
."|(?:\[\[)[Uu]ser\:([^\]\/\|]*)\/[Ss]ig[\|\]]/" //5: [[User:XX/sig]] links (compromise measure)
## Altra regexp
"/\[\[[Uu]ser(?:[\s_][Tt]alk)?\:([^\]\/\|]*)" //5: "[[User:XX/PageAboutMe" links (notice no end tag)
."|\[\[[Ss]pecial\:[Cc]ontributions\/([^\|\]]*)/"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
##########################################################################
#                                                                        #
# This program is free software; you can redistribute it and/or modify   #
# it under the terms of the GNU General Public License as published by   #
# the Free Software Foundation; version 2 of the License.                #
#                                                                        #
# This program is distributed in the hope that it will be useful,        #
# but WITHOUT ANY WARRANTY; without even the implied warranty of         #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the           #
# GNU General Public License for more details.                           #
#                                                                        #
##########################################################################
## etree | |
from lxml import etree | |
from bz2 import BZ2File | |
import sys | |
#import cProfile as profile | |
from functools import partial | |
import logging | |
import re | |
from collections import defaultdict | |
## multiprocessing | |
from multiprocessing import Pipe, Process | |
from sonet.graph import load as sg_load | |
from sonet import lib | |
import sonet.mediawiki as mwlib | |
## nltk | |
import nltk | |
## Module-level state shared between process_page() and main(); rebound or
## updated as globals during the parse.
count_utp, count_missing = 0, 0  # pages processed / users absent from the rich file
lang_user, lang_user_talk = None, None  # localized "User"/"User talk" namespace names
tag = {}  # dump XML tag names, filled by mwlib.get_tags() in main()
en_user, en_user_talk = u"User", u"User talk"  # English namespace names
user_classes = None  # username -> class mapping, loaded from the rich file
## frequency distribution
logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
### CHILD PROCESS | |
# smile dictionary
# Maps an emoticon class name to a tuple of regex fragments recognising it;
# build_smile_re()/find_smiles() OR the fragments together, each anchored to
# leading whitespace or start-of-string.
dsmile = {
    'happy': (r':[ -]?[)\]>]', r'=[)\]>]', r'\^[_\- .]?\^', 'x\)', r'\(^_^\)'),
    'sad': (r':[\- ]?[(\[<]', r'=[(\[<]'),
    'laugh': (r':[ -]?D', '=D'),
    'tongue': (':-?[pP]', '=[pP]', 'xP'),
    'normal': (r':[\- ]?\|',),
    'cool': (r'8[\- ]?\)',),
}
def build_smile_re(dsmile):
    """Compile each emoticon class of *dsmile* into one regex.

    Every fragment is anchored to a preceding whitespace character or to
    the start of the string, and the fragments of a class are OR-ed
    together.  Returns a dict mapping class name -> compiled pattern.
    """
    join_anchored = lambda fragments: r'|(?:\s|^)'.join(fragments)
    return dict((label, re.compile(r'(?:(?:\s|^)%s)' % join_anchored(patterns)))
                for label, patterns in dsmile.items())
# Pre-compiled emoticon regexes keyed by class name, built once at import time.
re_smile = build_smile_re(dsmile)
## the default value of r is evaluated only once, so the pattern is
## compiled a single time and then reused on every call
def remove_templates(text, r=re.compile(r"{{.*?}}")):
    """
    Strip MediaWiki {{...}} templates out of the given text:
    >>> remove_templates("hello{{template}} world")
    'hello world'
    >>> remove_templates("hello{{template}} world{{template2}}")
    'hello world'
    """
    cleaned = r.sub("", text)
    return cleaned
def find_smiles(text, dsmile=dsmile):
    """
    Find smiles in text and returns a dictionary of found smiles
    >>> find_smiles(':) ^^')
    {'happy': 2}
    >>> find_smiles('^^')
    {'happy': 1}
    >>> find_smiles(' :|')
    {'normal': 1}
    """
    res = {}
    for name, lsmile in dsmile.items():
        # NOTE(review): the pattern is rebuilt on every call (re's internal
        # cache softens the cost); the precompiled re_smile dict defined
        # above is never used here.
        regex_smile = r'(?:(?:\s|^)%s)' % (r'|(?:\s|^)'.join(lsmile))
        matches = len([1 for match in re.findall(regex_smile, text)
                       if match])
        # Debug output: show ~10 characters of context around each hit.
        for match in re.finditer(regex_smile, text):
            print 'sonetsmile: ', text[max(0, match.start()-10):match.end()+10]
        # Only classes with at least one hit appear in the result.
        if matches:
            res[name] = matches
    return dict(res)
def get_freq_dist(recv, send, fd=None, dcount_smile=None, classes=None):
    """
    Find word frequency distribution and count smile in the given text.

    Runs as the child process: receives (user class, message text) pairs
    over *recv* until the stop sentinel arrives, then sends the
    accumulated statistics back over *send* (first the top-1000 word
    frequencies per class, then the smiley counters per class).

    Parameters
    ----------
    recv : multiprocessing.Connection
        Read only
    send : multiprocessing.Connection
        Write only
    fd : dict
        Word frequency distributions
    dcount_smile : dict
        Smile counters
    classes : tuple, optional
        User classes to track; defaults to the six standard ones.
    """
    from operator import itemgetter
    from collections import Counter
    # Tokens to ignore: Italian stopwords plus punctuation and wiki markup.
    # NOTE(review): the stopword language is hard-coded to Italian.
    stopwords = frozenset(
        nltk.corpus.stopwords.words('italian')
    ).union(
        frozenset("[]':,(){}.?!*\"")
    ).union(
        frozenset(("==", "--"))
    )
    tokenizer = nltk.PunktWordTokenizer()
    if not classes:
        classes = ('anonymous', 'bot', 'bureaucrat', 'sysop', 'normal user',
                   'all')
    # prepare a dict of empty FreqDist, one for every class
    if not fd:
        fd = dict([(cls, nltk.FreqDist()) for cls in classes])
    if not dcount_smile:
        dcount_smile = dict([(cls, Counter()) for cls in classes])
    while 1:
        try:
            cls, msg = recv.recv()
        except TypeError: ## end
            # main() sends a bare 0 as the stop sentinel: unpacking it into
            # (cls, msg) raises TypeError, so ship the results and return.
            send.send([(cls, sorted(freq.items(),
                                    key=itemgetter(1),
                                    reverse=True)[:1000])
                       for cls, freq in fd.iteritems()])
            send.send([(cls, sorted(counters.items(),
                                    key=itemgetter(1),
                                    reverse=True))
                       for cls, counters in dcount_smile.iteritems()])
            return
        msg = remove_templates(msg)
        ## TODO: update 'all' just before sending by summing the other fields
        count_smile = find_smiles(msg)
        dcount_smile[cls].update(count_smile)
        dcount_smile['all'].update(count_smile)
        # Strip HTML, lowercase, tokenize, then drop stopwords.
        tokens = tokenizer.tokenize(nltk.clean_html(msg.encode('utf-8')
                                                    .lower()))
        text = nltk.Text(t for t in tokens if t not in stopwords)
        fd[cls].update(text)
        fd['all'].update(text)
#def get_freq_dist_wrapper(q, done_q, fd=None): | |
# profile.runctx("get_freq_dist(q, done_q, fd)", | |
# globals(), locals(), 'profile') | |
### MAIN PROCESS | |
def get_class(g, cls):
    """Return the vertices of graph *g* that belong to user class *cls*.

    'all' returns every vertex; 'normal user' selects vertices carrying
    none of the special flags; any other class name selects vertices
    where that flag is True.
    """
    vertices = g.g.vs
    if cls == 'all':
        return vertices
    if cls == 'normal user':
        filters = {'bot_ne': True, 'anonymous_ne': True,
                   'sysop_ne': True, 'bureaucrat_ne': True}
    else:
        filters = {cls: True}
    return vertices.select(**filters)
def process_page(elem, send):
    """
    Process one <page> element of the dump: if it is a User talk page,
    analyse the smileys of its revisions.

    send is a Pipe connection, write only
    """
    user = None
    global count_utp, count_missing
    for child in elem:
        if child.tag == tag['title'] and child.text:
            title = child.text
            try:
                # Extract the owner's username from the User-talk-page
                # title; ValueError means this is not a user talk page.
                user = mwlib.username_from_utp(title,
                                               (en_user_talk, lang_user_talk))
            except ValueError:
                return
        elif child.tag == tag['revision']:
            for rc in child:
                if rc.tag != tag['text']:
                    continue
                #assert user, "User still not defined"
                if not (rc.text and user):
                    continue
                user = user.encode('utf-8')
                try:
                    # NOTE(review): only sysop pages are analysed and the
                    # send() to the worker is commented out -- this looks
                    # like a temporary debugging state; confirm intent.
                    if user_classes[user] == 'sysop':
                        print '-----------', title.encode('utf-8'), '-----------'
                        print find_smiles(rc.text.encode('utf-8'))
                    #send.send((user_classes[user], rc.text))
                except KeyError:
                    ## fix for anonymous users not in the rich file
                    if mwlib.isip(user):
                        pass
                        #send.send(('anonymous', rc.text))
                    else:
                        logging.warn("Exception with user %s" % (user,))
                        count_missing += 1
    # Progress indicator: one stderr line every 500 processed pages.
    count_utp += 1
    if not count_utp % 500:
        print >> sys.stderr, count_utp
def main():
    """
    Entry point: parse a bz2-compressed MediaWiki XML dump together with
    an enriched graph pickle, feed user-talk messages to the
    get_freq_dist() worker process, and write per-class word and smiley
    statistics to *.dat files.
    """
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] dump enriched_pickle"
    )
    _, args = p.parse_args()
    if len(args) != 2:
        p.error("Too few or too many arguments")
    xml, rich_fn = args
    global lang_user_talk, lang_user, tag, user_classes
    ## pipe to send data to the subprocess
    p_receiver, p_sender = Pipe(duplex=False)
    ## pipe to get elaborated data from the subprocess
    done_p_receiver, done_p_sender = Pipe(duplex=False)
    src = BZ2File(xml)
    tag = mwlib.get_tags(src)
    # The dump filename encodes language and date, e.g. itwiki-20100218-...
    lang, date, _ = mwlib.explode_dump_filename(xml)
    g = sg_load(rich_fn)
    user_classes = dict(g.get_user_class('username',
                                         ('anonymous', 'bot', 'bureaucrat','sysop')))
    p = Process(target=get_freq_dist, args=(p_receiver, done_p_sender))
    p.start()
    translations = mwlib.get_translations(src)
    lang_user, lang_user_talk = translations['User'], translations['User talk']
    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"
    ## open with a faster decompressor (probably this cannot seek)
    src.close()
    src = lib.BZ2FileExt(xml)
    partial_process_page = partial(process_page, send=p_sender)
    mwlib.fast_iter(etree.iterparse(src, tag=tag['page']),
                    partial_process_page)
    logging.info('Users missing in the rich file: %d' % (count_missing,))
    p_sender.send(0) ## this STOPS the process
    print >> sys.stderr, "end of parsing"
    # NOTE(review): this sys.exit(0) makes everything below unreachable --
    # the worker's results are never read and the .dat files are never
    # written.  Together with the commented-out send() calls in
    # process_page(), this looks like a debugging short-circuit; confirm
    # before removing.
    sys.exit(0)
    g.set_weighted_degree()
    users_cache = {}
    # get a list of pair (class name, frequency distributions)
    for cls, fd in done_p_receiver.recv():
        with open("%swiki-%s-words-%s.dat" %
                  (lang, date,
                   cls.replace(' ', '_')), 'w') as out:
            # users in this group
            try:
                users = users_cache[cls]
            except KeyError:
                users = get_class(g, cls)
                users_cache[cls] = users
            print >> out, '#users: ', len(users)
            print >> out, '#msgs: ', sum(users['weighted_indegree'])
            for k, v in fd:
                print >> out, v, k
        del fd
    # Second recv(): the per-class smiley counters.
    for cls, counters in done_p_receiver.recv():
        with open("%swiki-%s-smile-%s.dat" %
                  (lang, date,
                   cls.replace(' ', '_')), 'w') as out:
            # users in this group
            try:
                users = users_cache[cls]
            except KeyError:
                users = get_class(g, cls)
                users_cache[cls] = users
            print >> out, '#users: ', len(users)
            print >> out, '#msgs: ', sum(users['weighted_indegree'])
            for k, v in counters:
                print >> out, v, k
    p.join()
    print >> sys.stderr, "end of FreqDist"
if __name__ == "__main__":
    # Profiling variant kept for reference; run main() directly.
    #import cProfile as profile
    #profile.run('main()', 'mainprof')
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment