Cheng-Jun Wang (@chengjun)

🏠
Working from home
View GitHub Profile
# -*- coding: utf-8 -*-
import urllib2
from bs4 import BeautifulSoup
import sys

# get the link for each chapter
url = "http://www.23wx.com/html/50/50550/"  # 三界独尊
content = urllib2.urlopen(url).read()
soup = BeautifulSoup(content, 'html.parser')
links = soup.find_all('td')[1000:]  # keep only the later <td> cells, where the chapter list sits
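The gist preview cuts off here. A minimal continuation sketch, assuming each chapter cell wraps an <a> tag whose href is relative to the index page (none of this is in the original snippet):

for td in links:
    a = td.find('a')
    if a is None:
        continue
    chapter_url = url + a['href']            # assumed: hrefs are relative to the index URL
    chapter_html = urllib2.urlopen(chapter_url).read()
    chapter_soup = BeautifulSoup(chapter_html, 'html.parser')
    print(a.text.encode('utf-8'))            # chapter title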
@chengjun
chengjun / scholartree
Created July 8, 2015 05:43
tree network
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 07 15:40:57 2015
@author: chengwang6
"""
import urllib2
from bs4 import BeautifulSoup
## Set the seed of crawler
seed = 'https://scholar.google.nl/citations?user=nNdt_G8AAAAJ&hl=en&oe=ASCII'
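Only the seed URL survives in the preview. A sketch of one plausible next step for a scholar crawler, collecting links to other Scholar profiles; the 'citations?user=' pattern is inferred from the seed URL itself, and Scholar may refuse requests without a browser-like User-Agent header:

content = urllib2.urlopen(seed).read()
soup = BeautifulSoup(content, 'html.parser')
profile_links = [a['href'] for a in soup.find_all('a', href=True)
                 if 'citations?user=' in a['href']]   # links to other Scholar profiles
print("%d candidate profiles found" % len(profile_links))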
@chengjun
chengjun / clean_wisenews.py
Created June 23, 2015 10:20
clean wise news data
with open("F:/百度云同步盘/Computational Communication/Data/占中数据20150328/zz-hk-2013.1-2013.3.rtf") as f:
    news = f.readlines()

def stringclean(s):
    # strip the RTF control words left over from the WiseNews export
    s = s.replace(r'\loch\af0\hich\af0\dbch\f15 \b\cf6 ', '')
    s = s.replace(r'\loch\af0\hich\af0\dbch\f15 \b0\cf0 ', '')
    s = s.replace(r'\par', '').replace('\n', '')
    return s
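The preview stops at the cleaning function; presumably it is then mapped over the lines read above and the result saved. A minimal sketch with an illustrative output filename:

cleaned = [stringclean(line) for line in news if line.strip()]
with open("zz-hk-2013.1-2013.3_clean.txt", "w") as out:   # illustrative output path
    for line in cleaned:
        out.write(line + "\n")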
### Title: Back to basics: High quality plots using base R graphics
### An interactive tutorial for the Davis R Users Group meeting on April 24, 2015
###
### Date created: 20150418
### Last updated: 20150423
###
### Author: Michael Koontz
### Email: [email protected]
### Twitter: @michaeljkoontz
###
require(igraph)
# generate a social graph with a Barabasi-Albert (preferential attachment) model
node_number = 100
g = barabasi.game(node_number); plot(g)
# pick the initial adopters (seeds) of the diffusion
seeds_num = 1
set.seed(2014); diffusers = sample(V(g), seeds_num); diffusers
infected = list()
infected[[1]] = diffusers
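The R snippet stops right before the contagion update. For readers following the logic, here is a rough re-sketch of the same kind of simple-contagion step in Python with networkx (not the author's code; the "each infected node infects one random neighbor per step" rule is an assumption):

import random
import networkx as nx

random.seed(2014)
g = nx.barabasi_albert_graph(100, 1)                    # analogue of barabasi.game(100)
infected = [set(random.sample(list(g.nodes()), 1))]     # one seed node, as above

def update_step(graph, infected_now):
    """Each infected node passes the contagion to one randomly chosen neighbor."""
    nxt = set(infected_now)
    for node in infected_now:
        nxt.add(random.choice(list(graph.neighbors(node))))
    return nxt

for _ in range(5):                                      # run a few diffusion steps
    infected.append(update_step(g, infected[-1]))
    print("%d nodes infected" % len(infected[-1]))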
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 13 21:28:10 2014
@author: v_chjwang
"""
from os import listdir
import glob
from collections import defaultdict
#!/usr/bin/env python
# -*- coding: utf8 -*-
from weibo import APIClient
import urllib2
import urllib
import sys
import time
from time import clock
import random
from collections import defaultdict, Counter
import glob

# Python 2 workaround so mixed str/unicode data does not raise UnicodeDecodeError
reload(sys)
sys.setdefaultencoding('utf8')
path = "D:/chengjun/renren/"
'''
step3: delete duplicates, sort data and save data
'''
import os
import glob
from collections import defaultdict

path = "D:/renren/friends_sorted/"
'''
# Step2: split the duplicated data into about 2000+ files by user ids
# to prepare for deleting the duplicated ties
'''
from collections import defaultdict
path = "D:/renren/"
bigfile = open(path + "friends_all.txt")
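The preview stops after opening the big edge file. A sketch of how step 2 might split it into roughly 2000 bucket files keyed by user id; the tab-separated layout and the modulo bucketing are assumptions, not from the gist:

import os
from collections import defaultdict

buckets = defaultdict(list)
for line in bigfile:
    user_id = line.split('\t')[0]                 # assumed: first field is the user id
    buckets[hash(user_id) % 2000].append(line)    # ~2000 output files
bigfile.close()

out_dir = path + "friends_split/"                 # hypothetical output folder
if not os.path.exists(out_dir):
    os.makedirs(out_dir)
for bucket, lines in buckets.items():
    with open(out_dir + str(bucket) + ".txt", "w") as out:
        out.writelines(lines)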