isomorphisms · September 15, 2011 19:21
diff --git a/gistfile1.pl b/gistfile1.pl
 init Score;

 ##### most telling features #######

 Score += const * 10th_decile( vector_of_likes$Date ); 

 Score += const * another_functional_of( vector_of_likes$Date );

 Score += const * number of likes;   #interaction term with % reblogs / number of written posts ... if the written posts start being hacked from somewhere else there are ways to identify that too

 Score += const * avg_speed_of like_submission;


 Score += const * is_there_an_ad_on_the_blog;


 ######



 ##### most telling feature overall: hashtag squatting #####

 @hashtags = qw( tags_on most_recently_liked_post );
 init squattercount;

 foreach (@hashtags) {
    squattercount += number_of_likes_with the_same_hashtag;   #accumulate over every tag instead of overwriting on each pass
    }


 Score += const *  squattercount ;
 #do this for a few randomly spread Likes

 #the squattercount is the best screening mechanism: use it first when deciding which accounts to run this regression on





 ###### names and photos are telling #########

 run SVM on photo;

 run SVM on name;

 if (the kernelised photo is north of the hyperplane) { Score += const; }

 if (the kernelised name is north of the hyperplane) { Score += const; }

 ######


 ###### pure laziness play ##########

 if (TumblrMessage == default "I love here :)") { Score += const; }
 #interaction terms with default message + every other term
 #interaction terms with default title + every other term

 Score += concave_fn( percent of posts that are reblogs ) * const;
 #actually the fn. should be nearly linear up to 80% and then starkly nonlinear (but still monotonic) beyond that; the exact shape is secondary, just so long as the highest %'s are differentiated




 ########## put it all together #######

 Do a median-polish regression on the logistic of {spam, not_spam} to get decent values for the constants.
	init Score;

	##### most telling features #######

	Score += const * 10th_decile( vector_of_likes$Date );

	Score += const * another_functional_of( vector_of_likes$Date );

	Score += const * number of likes; #interaction term with % reblogs / number of written posts ... if the written posts start being hacked from somewhere else there are ways to identify that too

	Score += const * avg_speed_of like_submission;


	Score += const * is_there_an_ad_on_the_blog;


	######



	##### most telling feature overall: hashtag squatting #####

	@hashtags = qw( tags_on most_recently_liked_post );
	init squattercount;

	foreach (@hashtags) {
	squattercount += number_of_likes_with the_same_hashtag;   #accumulate over every tag instead of overwriting on each pass
	}


	Score += const * squattercount ;
	#do this for a few randomly spread Likes

	#the squattercount is the best screening mechanism: use it first when deciding which accounts to run this regression on





	###### names and photos are telling #########

	run SVM on photo;

	run SVM on name;

	if (the kernelised photo is north of the hyperplane) { Score += const; }

	if (the kernelised name is north of the hyperplane) { Score += const; }

	######


	###### pure laziness play ##########

	if (TumblrMessage == default "I love here :)") { Score += const; }
	#interaction terms with default message + every other term
	#interaction terms with default title + every other term

	Score += concave_fn( percent of posts that are reblogs ) * const;
	#actually the fn. should be nearly linear up to 80% and then starkly nonlinear (but still monotonic) beyond that; the exact shape is secondary, just so long as the highest %'s are differentiated




	########## put it all together #######

	Do a median-polish regression on the logistic of {spam, not_spam} to get decent values for the constants.