Created
April 10, 2020 12:33
-
-
Save chandruscm/381abe465002226a25f13a603a645fea to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * NRLM Scraper by chandruscm
 * --------------------------
 * - Intended for educational purposes ONLY 📖
 * - Use at your own risk ☠️
 * - Requires jsoup : https://jsoup.org
 */
import org.jsoup.Connection
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import java.io.File
import java.io.IOException
/*
 * Endpoint and query-string fragments every request URL is assembled from.
 *
 * encd    -> Combination of state, district, block, grampanchayat and village codes.
 * reqcode -> Unique code generated with each request; subsequent requests need to pass it.
 * abc     -> Needed for no apparent reason when requesting SHG members.
 *
 * NOTE(review): "Memebrs" is spelled exactly as in the original URL - presumably the
 * server's own spelling; confirm before "correcting" it.
 */
const val BASE_URL = "https://nrlm.gov.in/BlockWiseSHGMemebrsAction.do?methodName=showShgMembers"
const val ENCD = "&encd="
const val REQ_CODE = "&reqcode="
const val ABC = "&abc=1"

// encd value sent with the very first request, before any drop-down has been read.
const val DEFAULT_ENCD = "01"

// Element ids of the drop-down boxes; passed to getDropDownItems to locate each box.
const val STATE_CODE_ID = "stateCodeId"
const val DISTRICT_CODE_ID = "districtCodeId"
const val BLOCK_CODE_ID = "blockCodeId"
const val GRAMPANCHAYAT_CODE_ID = "grampanchayatCodeId"
const val VILLAGE_CODE_ID = "villageCodeId"

// Names of the request parameters that carry the code selected at each level.
const val STATE_CODE = "stateCode"
const val DISTRICT_CODE = "districtCode"
const val BLOCK_CODE = "blockCode"
const val GRAMPANCHAYAT_CODE = "grampanchayatCode"
const val VILLAGE_CODE = "villageCode"

// First line of data.csv. It ends with a comma because every cell is written followed by one.
const val CSV_HEADER = "State name,District Name,Block Name,Grampanchayat Name,Village Name,SHG Name,Member Name,Father/Husband,Gender,Age,Social Category,\n"
/*
 * One entry of a drop-down box: `code` is the entry's value attribute and `name` its
 * visible text (see getDropDownItems).
 *
 * NOTE(review): the name hides java.lang.Object inside this file's package; it is kept
 * because the rest of the file refers to the class by this name.
 */
data class Object(val code: String, val name: String)
/*
 * Program entry point: runs the whole scrape by delegating to fetch().
 */
fun main() = fetch()
/*
 * Walks the state -> district -> block -> grampanchayat -> village drop-downs and appends
 * the member table of every village to data.csv in the working directory.
 *
 * Each request carries the reqcode found in the most recently received page, so
 * `document` is deliberately one variable that every request overwrites.
 *
 * Network failures (IOException from jsoup) are not caught and abort the run; only
 * failures while writing the file are caught and reported.
 */
fun fetch() {
    val landingUrl = "$BASE_URL$ENCD$DEFAULT_ENCD"
    // execute() actually performs the request, so the cookies forwarded below are the ones
    // the server set. A Connection.request() that was never executed holds no cookies.
    val landing = Jsoup.connect(landingUrl).method(Connection.Method.GET).execute()
    val cookies = landing.cookies()
    var document = landing.parse()

    val file = File("data.csv").apply {
        createNewFile()
        // Only an empty file gets the header, so a re-run does not repeat it mid-file.
        if (length() == 0L) appendText(CSV_HEADER)
    }

    for (state in getDropDownItems(document, STATE_CODE_ID)) {
        val stateParams = mapOf(STATE_CODE to state.code)
        document = requestPage(document, state.code, cookies, stateParams)
        println("Looking at ${state.name}")

        for (district in getDropDownItems(document, DISTRICT_CODE_ID)) {
            val districtParams = stateParams + (DISTRICT_CODE to district.code)
            document = requestPage(document, district.code, cookies, districtParams)
            println("Looking at ${district.name}")

            for (block in getDropDownItems(document, BLOCK_CODE_ID)) {
                val blockParams = districtParams + (BLOCK_CODE to block.code)
                document = requestPage(document, block.code, cookies, blockParams)
                println("Looking at ${block.name}")

                for (grampanchayat in getDropDownItems(document, GRAMPANCHAYAT_CODE_ID)) {
                    val grampanchayatParams = blockParams + (GRAMPANCHAYAT_CODE to grampanchayat.code)
                    document = requestPage(document, grampanchayat.code, cookies, grampanchayatParams)
                    println("Looking at ${grampanchayat.name}")

                    for (village in getDropDownItems(document, VILLAGE_CODE_ID)) {
                        val villageParams = grampanchayatParams + (VILLAGE_CODE to village.code)
                        document = requestPage(document, village.code, cookies, villageParams, members = true)
                        println("Writing data to file")
                        appendMemberRows(file, document)
                    }
                }
            }
        }
    }
}

/*
 * Requests the page for one drop-down selection and returns the parsed response.
 *
 * previous -> page received last; its reqcode is put into the URL.
 * code     -> code of the selected entry, sent as encd.
 * cookies  -> cookies sent with the request.
 * params   -> request parameters, sent in the map's iteration order.
 * members  -> true for the member listing: adds the abc fragment and uses POST instead of GET.
 */
private fun requestPage(
    previous: Document,
    code: String,
    cookies: Map<String, String>,
    params: Map<String, String>,
    members: Boolean = false
): Document {
    val prefix = if (members) "$BASE_URL$ABC" else BASE_URL
    val url = "$prefix$REQ_CODE${getReqCode(previous)}$ENCD$code"
    val connection = Jsoup.connect(url).cookies(cookies)
    params.forEach { (key, value) -> connection.data(key, value) }
    return if (members) connection.post() else connection.get()
}

/*
 * Appends every row of the first tbody in `document` to `file`, one line per tr, each td
 * followed by a comma. Does nothing when the page has no tbody. A failed write is
 * reported on stdout and otherwise ignored.
 */
private fun appendMemberRows(file: File, document: Document) {
    val table = document.select("tbody").first() ?: return
    // Build the whole page's worth of lines first so the file is opened once, not per cell.
    val lines = buildString {
        for (row in table.select("tr")) {
            for (cell in row.select("td")) {
                append(csvField(cell.text())).append(',')
            }
            append('\n')
        }
    }
    if (lines.isEmpty()) return
    try {
        file.appendText(lines)
    } catch (exception: IOException) {
        println("Error writing to file")
        exception.printStackTrace()
    }
}

/*
 * Returns `text` ready to be used as one CSV field: text containing a comma, a double
 * quote or a line break is wrapped in double quotes with inner quotes doubled, so it
 * cannot spill into neighbouring columns; any other text is returned unchanged.
 */
private fun csvField(text: String): String {
    val needsQuoting = text.any { it == ',' || it == '"' || it == '\n' || it == '\r' }
    return if (needsQuoting) "\"" + text.replace("\"", "\"\"") + "\"" else text
}
/*
 * Extract the unique request code for each request.
 *
 * The code is read from the link that is a direct child of the second "li" under the
 * element matching "ul.nav.navbar-nav.navbar-right": the link's absolute href is split
 * on "=" and the third piece is returned.
 *
 * Throws IllegalStateException, naming the missing part, when the page does not have
 * that shape.
 */
fun getReqCode(document: Document): String {
    val navItems = document.select("ul.nav.navbar-nav.navbar-right").select("li")
    val navItem = navItems.getOrNull(1)
        ?: error("Cannot read request code: expected at least 2 navbar items, found ${navItems.size}")
    val link = navItem.select(" > a").first()
        ?: error("Cannot read request code: second navbar item has no direct link")
    val href = link.absUrl("href")
    return href.split("=").getOrNull(2)
        ?: error("Cannot read request code: unexpected link target '$href'")
}
/*
 * Extract the items in a drop down box.
 *
 * document -> page to read from.
 * id       -> element id of the drop-down box.
 *
 * Returns one Object per child element of the box, in document order, taking the child's
 * value as `code` and its text as `name`. The first child is skipped (presumably the
 * placeholder entry - TODO confirm against the page). When the page has no element with
 * the given id the result is an empty list, so callers simply have nothing to iterate.
 */
fun getDropDownItems(document: Document, id: String): MutableList<Object> {
    val dropDown = document.getElementById(id) ?: return mutableListOf()
    return dropDown.children()
        .drop(1)
        .mapTo(mutableListOf<Object>()) { entry ->
            Object(code = entry.`val`(), name = entry.text())
        }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment