Bryan Yang (bryanyang0528), public gists
library("XML")
library("httr")
library("stringr")
library("igraph")
library("dplyr")
## Crawl the web version of PTT and collect the link to every article
data <- list()
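The comment above describes the crawl step, which the gist does in R with httr and XML. For illustration only, a rough Python sketch of the same link-collection step; the board name and URL pattern below are assumptions, not the gist's code:

import re
import requests

def list_article_links(board="Gossiping", page="index.html"):
    ## fetch one index page of a PTT board; some boards sit behind
    ## an age-check cookie on the web front end
    url = "https://www.ptt.cc/bbs/%s/%s" % (board, page)
    resp = requests.get(url, cookies={"over18": "1"})
    resp.raise_for_status()
    ## article permalinks look like /bbs/<board>/M.<id>.A.<hash>.html
    paths = re.findall(r'href="(/bbs/%s/M\.[^"]+\.html)"' % board, resp.text)
    return ["https://www.ptt.cc" + p for p in paths]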
bryanyang0528 / gist:7759a9c28c680c87f8d8
R dplyr example 5
## Grouped summary, nested-call style:
summarise(group_by(melted, sex, treatment, variable),
          mean=mean(value), sd=sd(value))

## The same summary, chained (`%.%` was dplyr's original chain operator;
## current dplyr uses `%>%`):
melted %>% group_by(sex, treatment, variable) %>%
  summarise(mean=mean(value), sd=sd(value))
Source: local data frame [8 x 5]
Groups: sex, treatment

  sex treatment  variable         mean        sd
1   1         1 response1  0.021856280 1.0124371
2   1         1 response2  0.045928150 1.0151670
3   1         2 response1 -0.065017971 0.9825428
4   1         2 response2  0.011512867 0.9463053
5   2         1 response1 -0.005374208 1.0095468
6   2         1 response2 -0.051699624 1.0154782
Source: local data frame [4,000 x 4]
Groups: sex, treatment, variable

  sex treatment  variable       value
1   1         1 response1 -0.15668214
2   1         2 response1 -0.40934759
3   1         1 response1  0.07103731
4   1         2 response1  0.15113270
5   1         1 response1  0.30836910
6   1         2 response1 -1.41891407
## The code for the toy data is exactly the same
data <- data.frame(sex = c(rep(1, 1000), rep(2, 1000)),
                   treatment = rep(c(1, 2), 1000),
                   response1 = rnorm(2000, 0, 1),
                   response2 = rnorm(2000, 0, 1))

## reshape2 still does its thing:
library(reshape2)
melted <- melt(data, id.vars=c("sex", "treatment"))
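For cross-reference only, a minimal pandas sketch of the same melt-then-grouped-summary pipeline; this is an added illustration, not part of the gist:

import numpy as np
import pandas as pd

## the same toy data: two sexes, two treatments, two normal responses
rng = np.random.default_rng(0)
data = pd.DataFrame({
    "sex": [1] * 1000 + [2] * 1000,
    "treatment": [1, 2] * 1000,
    "response1": rng.normal(0, 1, 2000),
    "response2": rng.normal(0, 1, 2000),
})

## melt to long form, then mean/sd per (sex, treatment, variable) group
melted = data.melt(id_vars=["sex", "treatment"])
print(melted.groupby(["sex", "treatment", "variable"])["value"]
            .agg(["mean", "std"]))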
def longTermPriority(path, maxTermLength, minFreq):
    longTerms = []      # long terms accepted so far
    longTermsFreq = []  # (long term, frequency) pairs
    for i in range(maxTermLength, 1, -1):          # term length, longest first
        text_list = cutSentence(path, longTerms)   # call the cutSentence function
        words_freq = ngram(text_list, i, minFreq)  # call the ngram function
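The preview cuts off inside the loop. A plausible continuation, assuming terms accepted at each length are fed back into longTerms so the next, shorter pass segments around them; the names match the snippet, but the logic is an assumption, not the gist's confirmed code:

        # assumption: remember each frequent term so the next, shorter pass
        # cuts around it instead of re-extracting its substrings
        for word, freq in words_freq:
            longTerms.append(word)
            longTermsFreq.append((word, freq))
    return longTermsFreq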
import operator  ## for sorting the term-frequency dict

def ngram(textLists, n, minFreq):
    ## args: the cleaned articles (a list of utf-8 strings), the term length n,
    ## and the minimum number of occurrences needed to keep a term
    words = []       # extracted candidate terms
    words_freq = {}  # term -> occurrence count
    result = []
    for textList in textLists:
        for w in range(len(textList) - (n - 1)):  # window count shrinks as n grows
            words.append(textList[w:w + n])       # the substring of length n at offset w
    for word in words:
        words_freq[word] = words_freq.get(word, 0) + 1
    ## (the preview ends at the loop above; the filter and sort below are
    ## reconstructed from the comments and the operator import)
    result = sorted(((w, f) for w, f in words_freq.items() if f >= minFreq),
                    key=operator.itemgetter(1), reverse=True)
    return result
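A quick hypothetical check of the reconstructed tail on a toy string:

## 'abcabcabc' yields bigrams ab bc ca ab bc ca ab bc for n=2,
## so with minFreq=2 every bigram survives
print(ngram([u"abcabcabc"], 2, 2))
## -> [(u'ab', 3), (u'bc', 3), (u'ca', 2)]  (tie order may vary)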
bryanyang0528 / gist:f5d87adb7d8729530393
N-gram v2.0 part1
# -*- coding: utf-8 -*-
import codecs    ## handles file encodings
import operator  ## used to sort the term-frequency dict

## the punctuation marks to cut on, decoded to utf-8 (Python 2)
cutlist = "<>/::;;,、"’,.。!?「\"\'\\\n\r《》“”!@#$%^&*()".decode("utf-8")

def cutSentence(text_path, keywords):  ## path of the source article, plus a list of extra terms to cut on
## read the article saved as a TXT file, decoding to UTF-8 on the way in
text = codecs.open("text.txt", "r", "utf-8")
text_new = ""
for line in text.readlines():
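    ## (the preview ends above; a plausible continuation, assuming each
    ## character in cutlist marks a sentence boundary; an assumption,
    ## not the gist's confirmed code)
    for char in line:
        if char in cutlist:
            text_new += u"\n"  # replace punctuation with a line break
        else:
            text_new += char
## one cleaned sentence per line, empty strings dropped
sentences = [s for s in text_new.split(u"\n") if s]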