Skip to content

Instantly share code, notes, and snippets.

View chengjun's full-sized avatar
🏠
Working from home

Cheng-Jun Wang chengjun

🏠
Working from home
View GitHub Profile
# -*- coding: utf-8 -*-
"""
Spyder Editor
This temporary script file is located here:
D:\chengjun\WinPython-64bit-2.7.6.4\settings\.spyder2\.temp.py
"""
'''
# Step2: split the duplicated data into about 2000+ files by user ids
# !/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import division
import os,time,string,random
def network():
#select the content which has been shared more than 500 times
user = {}
fname = 'd:/renren/id2string.txt'
# -*- coding: utf-8 -*-
"""
Spyder Editor
This temporary script file is located here:
C:\Users\chengwang6\.spyder2\.temp.py
"""
# I download the dump from https://ia601509.us.archive.org/10/items/stackexchange/
# I copy the names into a stack_namelist.txt
#---------load data------------#
# Working directory holding the Digg vote exports.
setwd("F:/digg/")
# Use the documented read.csv argument names (header, na.strings) instead of
# relying on R's partial argument matching ("head=", "na.string="), and the
# full TRUE/FALSE constants instead of the redefinable T/F shorthands.
ft = read.csv("./final_front_zero_mean_time.csv", header = TRUE, na.strings = 'NA', stringsAsFactors = TRUE)
dat = read.csv("./digg_votes_threshold.csv", header = TRUE, stringsAsFactors = FALSE)
# columns: storyid user time threshold
# 1 day as a session
time = dat$time
# Vote timestamps are Unix epoch seconds; convert to calendar dates.
time = as.POSIXct(time, origin = "1970-01-01")
# Day index relative to day 14394 — presumably the dataset's first day,
# so the series starts at 0. TODO confirm against the raw data.
time = as.numeric(as.Date(time)) - 14394
# Divide 5 by each i in [-10, 10); stop early once i reaches 3.
# Fixes from review: indentation restored (lost in the paste), the bare
# `except:` narrowed to ZeroDivisionError so real bugs are not silently
# swallowed, and Python-2 print statements converted to print() calls.
for i in range(-10, 10):
    try:
        a = 5 / i
        print(a)
    except ZeroDivisionError:
        # Only i == 0 can raise here.
        print('i = 0')
    if i == 3:
        print("i ==3")
        break
# -*- coding: utf-8 -*-
"""
Spyder Editor
This temporary script file is located here:
C:\Users\chengwang6\.spyder2\.temp.py
"""
import os
import urllib2
from bs4 import BeautifulSoup
import os
import urllib2
from bs4 import BeautifulSoup
from time import clock
from time import sleep
from random import randint
# Showing 31,065 closed projects with a public gallery. 1295 pages
# aggregate pages-->thread names--->thread replies
'''
@chengjun
chengjun / calculate_sentiment
Last active August 29, 2015 14:03
calculate_sentiment
def caculate_single_sentence_sentiment(group):
gsen = 0
no_words = group[0][0][0][0][0] # 否定词
level_words = group[0][0][0][0][1] # 程度词
sen_words = group[0][0][0][0][2] # 情感词
no_words_positions = [no_word[2] for no_word in no_words]
level_words_positions = [level_word[2] for level_word in level_words]
num_no_words = len(no_words)
weight = 1
# 第一层:根据否定词的个数进行句法分析
@chengjun
chengjun / mass-elite robustness test.R
Last active August 29, 2015 14:02
mass-elite robustness test
######################
"mass-elite robustness test"
######################
# Load the human-coded Ashoka project labels. The CSV has no header row,
# so columns arrive as V1, V2, ... and stay character (no factor coercion).
topic = read.csv("./human coding_Ashoka_combined_label_20140404.csv",
header = FALSE, stringsAsFactors = FALSE)
# Display names for the six coded topic categories; presumably these map
# onto the label columns/sections of `topic` — verify against the coding scheme.
cat_names = c("Environment", "Civic Engagement", "Learning/Education",
"Human Rights", "Health", "Economic Development" )
# section_id = 3
@chengjun
chengjun / scrape thread info using mechanize in python
Created June 2, 2014 04:58
scrape thread info using mechanize in python
# -*- coding: utf-8 -*-
"""
Spyder Editor
This temporary script file is located here:
C:\Users\chengwang6\Desktop\WinPython-64bit-2.7.6.4\settings\.spyder2\.temp.py
"""
import mechanize
import cookielib
from bs4 import BeautifulSoup