Created
June 23, 2009 23:26
-
-
Save mrflip/134909 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'wuclan/models/tweet/tweet_token' | |
require 'wukong/encoding' | |
module Wuclan::Models | |
Tweet.class_eval do | |
# Returns the tweet text prepared for token extraction: decoded_text with
# every run of whitespace collapsed to a single space and the ends stripped.
#
# Returns nil (so the caller skips the tweet) when the text is missing, holds
# more than 20 '&' characters, or contains the early-days default message.
def string_for_tokenizing
  return if text.nil?
  # simpleminded test for non-latin script: don't bother if > 20 entities
  return if text.count('&') > 20
  # skip default message from early days
  return if text =~ /just setting up my twttr/
  # Return decoded, whitespace-flattened text.  The pattern deliberately has
  # no trailing `s` option: in Ruby that flag fixes the regexp's encoding to
  # Windows-31J (it is not "dot matches newline") and makes the match raise
  # Encoding::CompatibilityError on non-ASCII UTF-8 text.
  decoded_text.gsub(/\s+/, ' ').strip
end
# Builds one token object per match that +klass+ extracts from +str+.
#
# klass - a token class responding to extract_tokens!(str) and to
#         new(word, user_id, tweet_id, freq).
# str   - the string handed to klass.extract_tokens!; the bang name signals
#         that the extractor may modify it in place.
#
# Each token is created with this tweet's twitter_user_id and id, and a
# frequency of 1.  Returns an Array of klass instances.
def tokens_for(klass, str)
  klass.extract_tokens!(str).map { |word| klass.new(word, twitter_user_id, id, 1) }
end
# Splits the tweet into typed tokens.
#
# extract_word_tokens - when truthy, WordToken tokens are also extracted from
#                       the string after the other token classes have run.
#
# Returns an Array of tokens (empty when string_for_tokenizing yields nothing).
#
# Order matters: the same working string is passed to each token class in
# turn, smilies and URLs first (on the original-case text), then retweets,
# @-names and hashtags once it has been downcased.
def tokenize(extract_word_tokens = nil)
  str = string_for_tokenizing
  return [] if str.blank?

  # Case-sensitive tokens
  toks = [SmilieToken, UrlToken].flat_map { |klass| tokens_for(klass, str) }

  # Case-insensitive tokens
  str.downcase!
  [RtToken, AtsignToken, HashtagToken].each do |klass|
    toks.concat(tokens_for(klass, str))
  end

  toks.concat(tokens_for(WordToken, str)) if extract_word_tokens
  toks
end
end | |
end |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
module Wuclan
  module Models
    #
    # Regular expressions (and the string fragments they are assembled from)
    # for picking URLs, hashtags, retweets, @-names and smilies out of tweet
    # text.
    #
    module TweetRegexes
      # ===========================================================================
      #
      # Twitter accepts URLs somewhat idiosyncratically, probably for good reason --
      # we rarely see ()![] in urls; more likely in a status they are punctuation.
      #
      # This is what I've reverse engineered.
      #
      #
      # Notes:
      #
      # * is.gd uses a trailing '-' (to indicate 'preview mode'): clever.
      # * pastoid.com uses a trailing '+', and idek.net a trailing ~ for no reason. annoying.
      #
      # Counterexamples:
      # * http://www.5irecipe.cn/recipe_content/2307/'/
      # * http://www.facebook.com/groups.php?id=1347199977&gv=12#/group.php?gid=18183539495
      #
      RE_DOMAIN_HEAD       = '(?:[a-zA-Z0-9\-]+\.)+'
      RE_DOMAIN_TLD        = '(?:com|org|net|edu|gov|mil|biz|info|mobi|name|aero|jobs|museum|[a-zA-Z]{2})'
      # RE_URL_SCHEME      = '[a-zA-Z][a-zA-Z0-9\-\+\.]+'
      RE_URL_SCHEME_STRICT = '[a-zA-Z]{3,6}'
      RE_URL_UNRESERVED    = 'a-zA-Z0-9' + '\-\._~'
      RE_URL_OKCHARS       = RE_URL_UNRESERVED + '\'\+\,\;=' + '/%:@' # not !$&()* [] \|
      RE_URL_QUERYCHARS    = RE_URL_OKCHARS + '&='
      RE_URL_HOSTPART      = "#{RE_URL_SCHEME_STRICT}://#{RE_DOMAIN_HEAD}#{RE_DOMAIN_TLD}"
      RE_URL               = %r{(
          #{RE_URL_HOSTPART}                           # Host
          (?:(?: \/ [#{RE_URL_OKCHARS}]+? )*?          # path: / delimited path segments
             (?: \/ [#{RE_URL_OKCHARS}]*[\w\-\+\~] )   # where the last one ends in a non-punctuation.
             |                                         # ... or no path segment
          )\/?                                         # with an optional trailing slash
          (?: \? [#{RE_URL_QUERYCHARS}]+ )?            # query: introduced by a ?, with &foo= delimited segments
          (?: \# [#{RE_URL_OKCHARS}]+ )?               # frag: introduced by a #
        )}x
      #
      # Technically a scheme can allow the characters '+', '-' and '.' within
      # it. In practice you can not only ignore those characters but all but a
      # few specific schemes.
      #
      # From a collection of ~9M tweeted urls, 99.4% were http://, with only the additional
      # https, mms, ftp, git, irc, feed, itpc, rtsp, hxxp, gopher, telnet, itms, ssh, webcal, svn
      # seemingly worth finding:
      #
      #   8925742 http
      #   6026 https  1841 ivo   122 mms     85 ftp     61 git   53 irc  45 feed    31 itpc  12 www
      #   12 rtsp     12 hxxp    12 gopher   9 telnet   9 itms   7 ssh   5 webcal   5 sop    4 wiie
      #   3 svn       3 sssp     3 file      2 res      1 xttp   1 xmlrpc  1 ssl    1 smb
      #
      # An hxxp http://en.wikipedia.org/wiki/Hxxp is used to obscure a link, so
      # make of that what you will.
      #
      # The ivo:// scheme is used by virtual astronomical observatories; as its
      # hostnames are given in reverse-dotted notation (uk.org.estar) these URIs
      # are imperfectly recognized. Twitter doesn't accept them at all:
      #   http://twitter.com/eSTAR_Project/status/1113930948
      #
      #
      # ===========================================================================
      #
      # A hash following a non-alphanum_ character (or at the start of the line),
      # followed by any number of alpha, num, -_.+:= and ending in an alphanum_
      #
      # This is overly generous to those dorky triple tags (geo:lat=69.3), but we'll soldier on somehow.
      #
      RE_HASHTAGS          = %r{(?:^|\W)\#([a-zA-Z0-9\-_\.+:=]+\w)(?:\W|$)}

      # ===========================================================================
      #
      # Retweets and Retweet Whores
      #
      # See ARetweetsB for more info.
      #
      # A retweet
      #   RT @interesting_user Something so witty Dorothy Parker would just give up
      #   Oh yeah and so's your mom (via @sixth_grader)
      #   retweeting @ogre: KEGGER TONITE RT pls
      #     ^^^ this is not a rtwhore; it matches first as a retweet
      #
      # and rtwhores
      #   retweet please: Hey here's something I'm whoring xxx
      #   KEGGER TONITE RT pls
      #
      # or semantically-incorrect matches such as (actual example):
      #   @somebody lol, love the 'please retweet' ending!
      #
      # Things that don't match:
      #   retweet is silly, @i_think_youre_dumb
      #   misspell the name of my Sony Via
      #
      RE_RETWEET_WORDS     = 'rt|retweet|retweeting'
      RE_RETWEET_ONLY      = %r{(?:#{RE_RETWEET_WORDS})}
      RE_RETWEET_OR_VIA    = %r{(?:#{RE_RETWEET_WORDS}|via|from)}
      RE_PLEASE            = %r{(?:please|plz|pls)}
      # The sub-patterns are spliced in via #source rather than interpolated as
      # Regexp objects: an interpolated Regexp expands to "(?-mix:...)", which
      # would turn the outer /i off again for exactly the words that need it
      # ("RT", "Retweet", "PLS", ...).
      RE_RETWEET           = %r{\b#{RE_RETWEET_OR_VIA.source}\W*@(\w+)\b}i
      RE_RTWHORE           = %r{
          \b#{RE_RETWEET_ONLY.source}\W*#{RE_PLEASE.source}\b
        | \b#{RE_PLEASE.source}\W*#{RE_RETWEET_ONLY.source}\b}ix

      # ===========================================================================
      #
      # following either the start of the line, or a non-alphanum_ character
      # the string of following [a-zA-Z0-9_]
      #
      # Note carefully: we _demand_ a preceding character (or start of line):
      # \b would match the @ inside an e-mail address (user@example.com), which
      # we don't want.
      #
      # Making an exception for RT@im_cramped_for_space.
      #
      # All retweets
      #
      RE_ATSIGNS           = %r{(?:^|\W|#{RE_RETWEET_OR_VIA})@(\w+)\b}

      # ===========================================================================
      #
      # Numbers (disabled sketches, kept for reference)
      #
      # RE_NUMBERS = %r{
      #   (?:^|\D) # non-number
      #   (
      #     |(?:\(\d{3}\)[\ \-]?\d{3}[\ \-]\d{4})
      #     |(?: (?:\d{1,3}\.)(?:\d{3},)*\.?\d+) # decimal number
      #     |(?: (?:\d{1,3}\.)(?:\d{3}\.)*,?\d+) # euro-style
      #     \d+
      #   )
      # }x
      #
      # # IP address
      # \b(?:\d{1,3}\.){3}\d{1,3}\b
      # credit card: (lax)
      # \b(?:\d[ -]*){13,16}\b
      # \b(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|6(?:011|5[0-9][0-9])[0-9]{12}|3[47][0-9]{13}|3(?:0[0-5]|[68][0-9])[0-9]{11}|(?:2131|1800|35\d{3})\d{11})\b
      #
      # [-+]?[0-9,]*\.?[0-9]*
      # [-+]?[0-9]*(\.[0-9]+)?([eE][-+]?[0-9]+)?

      # ===========================================================================
      #
      # Smilies !!! ^_^
      #
      RE_SMILIES_EYES      = "\\:8;"
      RE_SMILIES_NOSE      = "\\-=\\*o"
      RE_SMILIES_MOUTH     = "DP@Oo\\(\\)\\[\\]\\|\\{\\}\\/\\\\"
      RE_SMILIES           = %r{
          (?:^|\W)                      # non-smilie character
          ( (?:
              >?
              [#{RE_SMILIES_EYES}]      # eyes
              [#{RE_SMILIES_NOSE}]?     # nose, maybe
              [#{RE_SMILIES_MOUTH}] )   # mouth
           |(?:
              [#{RE_SMILIES_MOUTH}]     # mouth
              [#{RE_SMILIES_NOSE}]?     # nose, maybe
              [#{RE_SMILIES_EYES}]      # eyes
              <? )
           |(?: =[#{RE_SMILIES_MOUTH}])  # =) (=
           |(?: [#{RE_SMILIES_MOUTH}]=)  # =) (=
           |(?: \^[_\-]\^ )              # kawaaaaiiii!
           |(?: :[,\']\( )               # snif
           |(?: <3 )                     # heart
           |(?: \\m/ )                   # rawk
           |(?: x-\( )                   # dead
          )
          (?:\W|$)
        }x
    end
  end
end
# http://mail.google.com/support/bin/answer.py?hl=en&answer=34056 | |
# http://en.wikipedia.org/wiki/Emoticons | |
# | |
# :-) :) =] =) Smiling, happy | |
# :-( =( :[ :< frowning, Sad | |
# ;-) ;) ;] Wink | |
# :D =D XD BD Large grin or laugh | |
# :P =P XP Tongue out, or after a joke | |
# <3 S2 :> Love | |
# :O =O Shocked or surprised | |
# =I :/ :-\ Bored, annoyed or awkward; concerned. | |
# :S =S :? Confused, embarrassed or uneasy | |
# Icon Meaning Icon Meaning Icon Meaning | |
# (^_^) smile (^o^) laughing out loud d(^_^)b thumbs up (not ears) | |
# (T_T) sad (crying face) (-.-)Zzz sleeping (Z.Z) sleepy person | |
# \(^_^)/ cheers, "Hurrah!" (*^^*) shyness (-_-); sweating (as in ashamed), or exasperated. | |
# (*3*) "Surprise !." (?_?) "Nonsense, I don't know." (^_~) wink | |
# (o.O) shocked/disturbed (<.<) shifty, suspicious v(^_^)v peace | |
# | |
# [\\dv](^_^)[bv/] | |
# |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'active_support/core_ext/class/inheritable_attributes.rb' | |
require 'wuclan/models/tweet/tweet_regexes' | |
module Wuclan::Models | |
#
# One token pulled out of a tweet: the token text, the user and tweet it came
# from, and a frequency count.
#
# Subclasses set +extract_re+ to the pattern whose first capture group is the
# token text.
#
class TweetToken < TypedStruct.new(
    [:word,     String],
    [:user_id,  Integer],
    [:tweet_id, Integer],
    [:freq,     Integer]
  )
  include ModelCommon
  include TweetRegexes
  class_inheritable_accessor :extract_re

  # Builds the token, defaulting freq to 1 when a word was given without one.
  #
  # The assignment goes through +self.freq=+ on purpose: a bare `freq = 1`
  # only creates a local variable and leaves the attribute unset.
  # NOTE(review): relies on TypedStruct generating a freq= writer -- confirm.
  def initialize(*args)
    super(*args)
    self.freq = 1 if self.freq.blank? && !word.blank?
  end

  # NOTE(review): the struct above declares four fields; confirm 5 is intended.
  def num_key_fields
    5
  end

  def numeric_id_fields
    []
  end

  # crawl through the string
  # remove each token, leave a space behind
  #
  # str - the String to scan; it is modified in place (every match of
  #       extract_re is replaced by a single space).
  #
  # Returns an Array with the stripped first capture group of each match.
  def self.extract_tokens!(str)
    toks = []
    str.gsub!(extract_re) do
      toks << Regexp.last_match(1).strip
      ' '
    end
    toks
  end
end
# Token for an emoticon; extraction uses RE_SMILIES.
class SmilieToken < TweetToken
  self.extract_re = RE_SMILIES
end
# Token for a URL; extraction uses RE_URL.
class UrlToken < TweetToken
  self.extract_re = RE_URL
end
# Token for a retweeted user; extraction uses RE_RETWEET.
class RtToken < TweetToken
  self.extract_re = RE_RETWEET

  # Extracts the retweeted names from +str+ via the inherited extractor and
  # returns each one prefixed with 'RT_@'.
  def self.extract_tokens!(str)
    super.map { |name| 'RT_@' + name }
  end
end
# Token for an @-name; extraction uses RE_ATSIGNS.
class AtsignToken < TweetToken
  self.extract_re = RE_ATSIGNS

  # Extracts the names from +str+ via the inherited extractor and returns
  # each one prefixed with '@'.
  def self.extract_tokens!(str)
    super.map { |name| '@' + name }
  end
end
# Token for a hashtag; extraction uses RE_HASHTAGS.
class HashtagToken < TweetToken
  self.extract_re = RE_HASHTAGS

  # Extracts the tags from +str+ via the inherited extractor and returns
  # each one prefixed with '#'.
  def self.extract_tokens!(str)
    super.map { |tag| '#' + tag }
  end
end
# Token for a plain word. Has no extract_re; extraction is done by the
# string munging in extract_tokens! below.
class WordToken < TweetToken
  self.extract_re = nil

  #
  # This is pretty simpleminded.
  #
  # Returns all words of three or more characters (nil input gives []).
  # * terminal 't and 's (as in "don't" and "it's") are tokenised together
  #   with their word
  #
  # * FIXME -- this doesn't leave str as blank, as it should to behave like
  #   the other ! methods (str itself is never modified here)
  #
  def self.extract_tokens!(str)
    return [] unless str

    # kill off all punctuation except 's and 't
    # this includes hyphens (words are split)
    #
    # '!' serves as a temporary stand-in for the apostrophes worth keeping:
    # the first gsub has already removed every real '!', so the final gsub
    # restores only the stand-ins.
    cleaned = str.downcase
                 .gsub(/[^\w\'@]+/, ' ')
                 .gsub(/\'([st])\b/, '!\1')
                 .gsub(/\'/, ' ')
                 .gsub(/!/, "'")

    # Busticate at whitespace, then drop anything shorter than three characters
    cleaned.strip.split(/\s+/).reject { |w| w.length < 3 }
  end
end
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment