Skip to content

Instantly share code, notes, and snippets.

View rohitdholakia's full-sized avatar

Rohit Dholakia rohitdholakia

View GitHub Profile
@rohitdholakia
rohitdholakia / LogitReg.py
Created May 18, 2013 20:51
Logit regression example
# Fit an L1-penalised logistic regression on the data file named by the first
# command-line argument and print the learned coefficients.
#
# Usage: python LogitReg.py <data-file>
import sys  # imported explicitly: sys.argv is used below and must not depend on the star import

from sklearn import linear_model
from FeatureUtils import *
import numpy as np

# getXy is expected to come from FeatureUtils via the star import above; it is
# handed the path given on the command line and its result is unpacked into
# the (features, output) pair passed to fit().
(features, output) = getXy(sys.argv[1])

# The solver is named explicitly because an L1 penalty is only supported by
# some solvers (liblinear, saga); relying on the library default fails on
# scikit-learn releases whose default solver is lbfgs.
clf = linear_model.LogisticRegression(penalty='l1', solver='liblinear')
clf.fit(features, output)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(clf.coef_)
@rohitdholakia
rohitdholakia / Snapshot.csv
Created May 18, 2013 15:56
Snapshot of the CSV file
We can make this file beautiful and searchable if this error is corrected: No commas found in this CSV file in line 0.
37313 14 68 7 0 0 5 4 162 35 11 0 0
478335 9 14 2 0 0 2 5 70 1 24 0 0
8945 5 94 10 0 1 3 3 3395 239 524 67 0
8741 6 78 2 0 0 4 3 4784 938 564 17 0
8976 3 156 6 0 4 5 4 3778 757 1781 269 0
391411 6 90 1 0 0 2 1 686 32 150 1 0
762005 5 511 0 0 4 2 5 10 0 0 0 0
749982 5 169 0 0 0 1 4 1 0 0 0 0
316016 6 81 1 0 0 3 2 134 29 27 0 0
855171 6 567 1 0 0 1 2 87 0 19 0 0
@rohitdholakia
rohitdholakia / FeatureExtractor.py
Last active December 17, 2015 11:49
Feature Extraction
import types
import sys
import marshal
import Utils
from lxml import etree
import csv
class User:
#this is for user features
def __init__(self, userId, d):
self.reputation = d['reputation']
@rohitdholakia
rohitdholakia / UserDetails.py
Last active December 17, 2015 10:39
User Details SO
import sys
import os
import marshal
import Utils
from lxml import etree
# Bail out with the usage message unless every argument it names is present.
# The usage line lists four positional arguments, so argv must hold at least
# five entries (script name + 4); exit non-zero so callers can detect the error.
if len(sys.argv) < 5:
    print('python name.py posts.xml acceptedIdsList users.xml userDictOutput')
    sys.exit(1)
userdict = {}
@rohitdholakia
rohitdholakia / MostTags.py
Created May 14, 2013 06:57
Finding the most often occurring tags
#Find out the tags that are most often used with questions tagged famous
import sys
import os
import marshal
from lxml import etree
from collections import *
import operator
from Utils import *
if(len(sys.argv)<2):
print 'python name.py posts.xml tagDistributionDict'
@rohitdholakia
rohitdholakia / Utils.py
Created May 14, 2013 06:44
A snapshot of the utils file
def getCreationDate(elem):
    """Return the "CreationDate" attribute of *elem*.

    *elem* may be any object exposing ``get(name)``; presumably an XML
    element from the parsed dump (confirm against callers). The value is
    returned exactly as ``elem.get`` yields it, with no parsing.
    """
    return elem.get("CreationDate")
def getParent(elem):
    """Return the "ParentId" attribute of *elem*.

    *elem* may be any object exposing ``get(name)``; presumably an XML
    element from the parsed dump (confirm against callers). The value is
    returned exactly as ``elem.get`` yields it, with no conversion.
    """
    return elem.get("ParentId")
def getPostTypeId(elem):
    """Return the "PostTypeId" attribute of *elem*.

    *elem* may be any object exposing ``get(name)``; presumably an XML
    element from the parsed dump (confirm against callers). The value is
    returned exactly as ``elem.get`` yields it, with no conversion.
    """
    return elem.get("PostTypeId")
def getOwner(elem):
#num non-hashes hashes
1 37.5968768597 38.9962208271
4 39.5982711315 42.2955231667
20 40.4657788277 43.3965361118
50 120.064239025 128.437549829
100 288.91673708 299.555378914
200 584.331137896 609.95442009
@rohitdholakia
rohitdholakia / makefile
Created May 9, 2013 06:34
Example makefile
# Values passed as the last argument to Lookup.py; one run is made per value
# and its standard output is saved to Measurements/<value>.txt.
SIZES = 1 4 20 50 100 200

# `all` names no file, so declare it phony to make it run unconditionally.
.PHONY: all
all:
	mkdir -p Measurements
	for n in $(SIZES); do \
		python Lookup.py recap.input Outputs mapping.map $$n > Measurements/$$n.txt || exit 1; \
	done
@rohitdholakia
rohitdholakia / Lookup-Hashes.py
Last active December 17, 2015 03:48
Lookup with Hashes
import sys
import os
import multiprocessing
from collections import defaultdict
from TrueCasing import *
import time
import redis
import gzip
class Project:
@rohitdholakia
rohitdholakia / Lookup.py
Created May 9, 2013 05:50
Looking up Redis when the input is stored without using hashes
import sys
import os
import multiprocessing
from collections import defaultdict
from TrueCasing import *
import time
import redis
import gzip
class Project: