nanxstats · September 30, 2012 16:03 · yanping · Oct 9, 2012
diff --git a/test.R b/test.R
 # Scrape every table linked from the four "ydzyzb" index pages on
 # www.wxtj.gov.cn and store the first HTML table of each linked page in a
 # variable named data_<year>_<month>.
 #
 # library() (not require()) so a missing XML package stops the script here
 # instead of failing later with an unrelated-looking error.
 library(XML)

 site <- "http://www.wxtj.gov.cn"
 pages <- paste0(
   site, "/tjxx/tjsj/ydzyzb/index", c("", "_2", "_3", "_4"), ".shtml"
 )

 # Parse each index page once; both attributes below are read from the same
 # set of <a> nodes inside the "newstitle" cells.
 link_path <- "//tr//td[@class='newstitle']//a"
 docs <- lapply(pages, htmlTreeParse, useInternalNodes = TRUE)

 # Collect one attribute of every matching link across all index pages.
 link_attr <- function(attr) {
   found <- lapply(docs, function(doc) {
     xpathApply(doc, path = link_path, xmlGetAttr, attr)
   })
   as.character(unlist(found))
 }

 hrefs <- link_attr("href")
 title <- link_attr("title")

 # paste0() turns a zero-length vector into "", which would yield the bare
 # site URL; keep the result empty when no links were found.
 urls <- if (length(hrefs) > 0) paste0(site, hrefs) else character(0)

 # Links lacking one of the two attributes are dropped by unlist(), which
 # would silently misalign names and URLs.
 stopifnot(length(urls) == length(title))

 # The year is taken from the first four characters of the title.
 year <- substr(title, 1, 4)

 # Keep the text between "年" and "月", then strip a leading "1<dash>" so only
 # the part after the dash remains. Three dash variants are handled.
 # NOTE(review): presumably titles look like "2012年1-8月..." - confirm.
 tmp <- sub("^.*年([^ ]+)月.*$", "\\1", title)
 tmp <- sub("^.*1－([^ ]+).*$", "\\1", tmp)
 tmp <- sub("^.*1―([^ ]+).*$", "\\1", tmp)
 tmp <- sub("^.*1-([^ ]+).*$", "\\1", tmp)

 # Left-pad single-character months with a zero.
 month <- tmp
 one_char <- which(nchar(month) == 1)
 month[one_char] <- paste0("0", month[one_char])

 name <- paste("data", year, month, sep = "_")

 # assign() binds each table under its generated name without building and
 # evaluating code from scraped strings; seq_along() is a no-op on empty input.
 for (i in seq_along(urls)) {
   assign(name[i], readHTMLTable(urls[i])[[1]])
 }
	# Scrape every table linked from the four "ydzyzb" index pages on
	# www.wxtj.gov.cn and store the first HTML table of each linked page in a
	# variable named data_<year>_<month>.
	#
	# library() (not require()) so a missing XML package stops the script here
	# instead of failing later with an unrelated-looking error.
	library(XML)

	site <- "http://www.wxtj.gov.cn"
	pages <- paste0(
	  site, "/tjxx/tjsj/ydzyzb/index", c("", "_2", "_3", "_4"), ".shtml"
	)

	# Parse each index page once; both attributes below are read from the same
	# set of <a> nodes inside the "newstitle" cells.
	link_path <- "//tr//td[@class='newstitle']//a"
	docs <- lapply(pages, htmlTreeParse, useInternalNodes = TRUE)

	# Collect one attribute of every matching link across all index pages.
	link_attr <- function(attr) {
	  found <- lapply(docs, function(doc) {
	    xpathApply(doc, path = link_path, xmlGetAttr, attr)
	  })
	  as.character(unlist(found))
	}

	hrefs <- link_attr("href")
	title <- link_attr("title")

	# paste0() turns a zero-length vector into "", which would yield the bare
	# site URL; keep the result empty when no links were found.
	urls <- if (length(hrefs) > 0) paste0(site, hrefs) else character(0)

	# Links lacking one of the two attributes are dropped by unlist(), which
	# would silently misalign names and URLs.
	stopifnot(length(urls) == length(title))

	# The year is taken from the first four characters of the title.
	year <- substr(title, 1, 4)

	# Keep the text between "年" and "月", then strip a leading "1<dash>" so only
	# the part after the dash remains. Three dash variants are handled.
	# The ".*" quantifiers are required: with a bare "." the patterns only match
	# when exactly one character surrounds the captured text, so a title that
	# starts with a four-character year is never matched and passes through
	# unchanged.
	# NOTE(review): presumably titles look like "2012年1-8月..." - confirm.
	tmp <- sub("^.*年([^ ]+)月.*$", "\\1", title)
	tmp <- sub("^.*1－([^ ]+).*$", "\\1", tmp)
	tmp <- sub("^.*1―([^ ]+).*$", "\\1", tmp)
	tmp <- sub("^.*1-([^ ]+).*$", "\\1", tmp)

	# Left-pad single-character months with a zero.
	month <- tmp
	one_char <- which(nchar(month) == 1)
	month[one_char] <- paste0("0", month[one_char])

	name <- paste("data", year, month, sep = "_")

	# assign() binds each table under its generated name without building and
	# evaluating code from scraped strings; seq_along() is a no-op on empty input.
	for (i in seq_along(urls)) {
	  assign(name[i], readHTMLTable(urls[i])[[1]])
	}