xgqfrms-GitHub/python-spider.md Secret

Last active June 6, 2017 03:46

Star (1) You must be signed in to star a gist
Fork (0) You must be signed in to fork a gist

Learn more about clone URLs
Clone this repository at <script src="https://gist.github.com/xgqfrms-GitHub/c8b3992534950ea68d7333951a795ebb.js"></script>
Save xgqfrms-GitHub/c8b3992534950ea68d7333951a795ebb to your computer and use it in GitHub Desktop.

Download ZIP

python spider

Raw

python-spider.md

python spider

https://github.com/xgqfrms/Python

https://libraries.io/github/xgqfrms-GitHub/servo

https://www.cnblogs.com/xgqfrms/tag/python/

https://abc.xgqfrms.xyz/Python/

https://www.python.org/dev/peps/pep-0008/

https://github.com/xgqfrms/Python/blob/gh-pages/ebooks/CodeSchool-TryDjango.pdf

restful api

https://www.cnblogs.com/xgqfrms/p/5828910.html

pythonanywhere

https://www.pythonanywhere.com/

在云中托管，运行和编码Python！

https://xgqfrms.pythonanywhere.com/

https://www.pythonanywhere.com/pricing/

others

https://github.com/FideoJ/get_stjszx/blob/master/get_stjszx.py

    

__author__ = 'Lin Jian' 
# -*- coding: utf-8 -*-
import urllib2
import re
import sys

def get_page(page_index, filename, skip_lines=1090):
    """Fetch one results page and append the records found on it to a file.

    Python 2 code (relies on ``urllib2``). The page is requested with a
    10 second timeout, decoded as GBK, searched for table rows with the
    fields ID, Name, College, Major, ArriveDate and ArriveID, and every
    match is appended to ``filename`` as one UTF-8 line whose fields are
    padded to 15 characters.

    Args:
        page_index: Page number placed in the ``page=`` query parameter.
        filename: Path of the output file; it is opened in append mode.
        skip_lines: Lines of the response whose zero-based index is less
            than or equal to this value are ignored. Presumably the results
            table starts after this point in the page -- TODO confirm
            against the live markup.

    Returns:
        True when the page was downloaded and its rows were written,
        False when connecting to or reading from the server failed, so
        the caller can retry.
    """
    import socket  # local import: only needed for the read-error handler

    page_url = 'http://www.stjszx.net/jszxgk/gk.asp?page=' + str(page_index)
    try:
        request = urllib2.Request(page_url)
        response = urllib2.urlopen(request, timeout=10)
        print('Page {0}: Connected'.format(page_index))
    except urllib2.URLError as e:
        print('Page {0}: {1}. Retry!'.format(page_index, e.reason))
        return False

    # Nothing has been written to the output file yet, so a failure while
    # reading the body can safely be reported as "retry" without producing
    # duplicate rows.
    try:
        content = u''.join(
            line.decode('gbk')
            for i, line in enumerate(response)
            if i > skip_lines
        )
    except socket.error as e:  # includes socket.timeout
        print('Page {0}: {1}. Retry!'.format(page_index, e))
        return False
    finally:
        # Always release the connection, whether or not reading succeeded.
        response.close()

    # re.S lets '.' span newlines, so a record may be spread over several lines.
    pattern = re.compile(r'<tr>[\s ]*<td><div align="center">(?P<ID>\d+)</div></td>[\s ]*<td><div align="center">(?P<Name>.*?)</div></td>[\s ]*<td><div align="center">(?P<College>.*?)</div></td>[\s ]*<td><div align="center">(?P<Major>.*?)</div></td>[\s ]*<td><div align="center">(?P<ArriveDate>.*?)&nbsp;</div></td>[\s ]*<td><div align="center">(?P<ArriveID>.*?)&nbsp;</div></td>[\s ]*</tr>', re.S)
    persons = pattern.findall(content)

    with open(filename, 'a') as f:
        for person in persons:
            # One output line per record; each captured field is left-aligned
            # in a 15-character column.
            for item in person:
                f.write(u'{0:15}'.format(item).encode('utf-8'))
            f.write(u'\n'.encode('utf-8'))
    print('Page {0}: Done'.format(page_index))
    return True


# The output file path is a required command-line argument; fail with a
# usage message instead of an IndexError when it is missing.
if len(sys.argv) < 2:
    sys.exit('Usage: {0} <output_file>'.format(sys.argv[0]))

# Download pages 1 through 11, retrying each page until get_page reports
# success before moving on to the next one.
for page_index in range(1, 12):
    while not get_page(page_index, sys.argv[1]):
        pass

爬取汕头市金山中学官网上2016级高考录取信息。网址：http://www.stjszx.net/jszxgk/gk.asp

Author

xgqfrms-GitHub commented Jun 3, 2017 •

edited

Loading

svg logos

https://rollbar.com/docs/

https://rollbar.com/assets/shared/logos/react-icon.svg

https://rollbar.com/assets/shared/logos/angular-icon.svg

https://rollbar.com/assets/shared/logos/javascript-icon.png

<ul class="image-list">
    <li><a href="/docs/notifier/rollbar.js/" class="javascript">Javascript</a></li>
    <li><a href="/docs/notifier/rollbar.js/" class="react">React</a></li>
    <li><a href="https://github.com/tandibar/ng-rollbar" class="angular">Angular 1</a></li>
    <li><a href="/docs/notifier/rollbar.js/#angular-2" class="angular">Angular 2</a></li>
    <li><a href="https://github.com/davewasmer/ember-cli-rollbar" class="ember">Ember</a></li>
    <li>
      <p><a href="/docs/notifier/rollbar.js/" class="backbone">BackboneJS</a></p>
    </li>
    <li><a href="/docs/notifier/rollbar-android/" class="android">Android</a></li>
    <li><a href="/docs/notifier/rollbar-ios/" class="ios">iOS</a></li>
    <li><a href="https://github.com/emilyemorehouse/cordova-plugins-rollbar" class="ionic">Ionic</a></li>
    <li><a href="/docs/notifier/flash_rollbar/" class="flash">Flash</a></li>
    <li><a href="/docs/items_other/" class="other">Others</a></li>
  </ul>

Question: how can I read the `::before` pseudo-element's `background-image` of these links with JavaScript?

js spider ???

    // TODO list

Author

xgqfrms-GitHub commented Jun 3, 2017

https://rollbar.com/assets/shared/logos/javascript-icon.png

Author

xgqfrms-GitHub commented Jun 3, 2017 •

edited

Loading

// image-list

/*
Reference notes on the Selection API (not used by the code below):

https://developer.mozilla.org/zh-CN/docs/Web/API/Document/getSelection

https://developer.mozilla.org/zh-CN/docs/Web/API/Window/getSelection

Returns a Selection object representing the range of text selected by the
user or the current position of the caret.


selection is a Selection object.
To convert the selection to a string, append an empty string ("") to it or
call its toString() method.

function foo() {
    let selObj = window.getSelection(); 
    console.log(selObj);
    let selRange = selObj.getRangeAt(0);
    // other code
}

https://developer.mozilla.org/zh-CN/docs/Web/API/HTMLInputElement/setSelectionRange



*/



// image-list

// Abandoned attempt: selecting the `::before` pseudo-element directly.
// Pseudo-elements are not DOM nodes, so a selector query cannot return them.
// document.querySelectorAll(`ul.image-list a::before`);


// Collect every anchor inside the `ul.image-list` list instead.
const as = document.querySelectorAll(`ul.image-list a`);

// Inspect the first matched anchor in the console; the markup noted for it
// is reproduced in the comment below.
as[0];

/*

<a href="/docs/notifier/rollbar-gem/" class="ruby">
    ::before
    Ruby
</a>

*/

Author

xgqfrms-GitHub commented Jun 3, 2017 •

edited

Loading

solution

https://stackoverflow.com/questions/44342065/how-to-get-a-dom-elements-before-content-with-javscript

// Console session run against https://rollbar.com/docs/

// image-list

// Abandoned attempt: a selector query cannot return `::before` pseudo-elements.
// document.querySelectorAll(`ul.image-list a::before`);


// Every anchor inside the `ul.image-list` list.
const links = document.querySelectorAll(`ul.image-list a`);

// Inspect the first anchor.
links[0];

// None of the text/markup accessors expose the pseudo-element: each one only
// yields the anchor's own text, as the recorded outputs show.
links[0].textContent;
// "Ruby"

links[0].innerText;
// "Ruby"

links[0].innerHTML;
// "Ruby"


// The icon is reachable through the computed style of the pseudo-element.
// Legacy single-colon form; the recorded output matches the `::before` call below.
console.log(getComputedStyle(links[0], ':before').getPropertyValue('background-image'));
// url("https://rollbar.com/assets/shared/logos/ruby-icon.svg")

// Same lookup with the double-colon form, kept in `img` for further processing.
let img = getComputedStyle(links[0], '::before').getPropertyValue('background-image');
// url("https://rollbar.com/assets/shared/logos/ruby-icon.svg")


// string & regex

`getComputedStyle()` & `getPropertyValue()`

Author

xgqfrms-GitHub commented Jun 3, 2017 •

edited

Loading

`getComputedStyle()` & `getPropertyValue()`

https://www.w3schools.com/cssref/sel_before.asp

CSSStyleDeclaration `Object Properties` & CSSStyleDeclaration `Object Methods`

https://www.w3schools.com/jsref/obj_cssstyledeclaration.asp

JavaScript `Window.getComputedStyle()` Method

https://www.w3schools.com/jsref/jsref_getcomputedstyle.asp

`getPropertyValue()`

https://www.w3schools.com/jsref/jsref_getcomputedstyle.asp

<div id="test" style="height: 50px;background-color: lightblue;">Test Div</div>
<p>The computed background color for the test div is: <span id="demo"></span></p>

<script>
/**
 * Reads the computed `background-color` of the element with id "test" and
 * displays it as the text of the element with id "demo".
 *
 * Takes no arguments and returns nothing; its only effect is updating the
 * "demo" element.
 */
function myFunction() {
    const elem = document.getElementById("test");
    const backgroundColor = window.getComputedStyle(elem).getPropertyValue("background-color");
    // The value is plain text, so assign it as text rather than parsing it as HTML.
    document.getElementById("demo").textContent = backgroundColor;
}
</script>

Author

xgqfrms-GitHub commented Jun 4, 2017

`document.createDocumentFragment` & `document.createElement`

https://developer.mozilla.org/zh-CN/docs/Web/API/Document/createDocumentFragment

// Target list: the first <ul> on the page. Fail early with a clear message
// when the page has none, before any elements are created.
const ul = document.getElementsByTagName("ul")[0];
if (!ul) {
    throw new Error("No <ul> element found on this page");
}

const browserList = [
    "Internet Explorer",
    "Mozilla Firefox",
    "Safari",
    "Chrome",
    "Opera",
];

// Build all <li> items inside a detached fragment so they are attached to
// the list with a single appendChild call.
const docfrag = document.createDocumentFragment();
for (const browserName of browserList) {
    const li = document.createElement("li");
    li.textContent = browserName;
    docfrag.appendChild(li);
}

ul.appendChild(docfrag);

// The <ul> now ends with a list of well-known web browsers.

https://developer.yahoo.com/performance/rules.html

https://csspod.com/frontend-performance-best-practices/

Author

xgqfrms-GitHub commented Jun 4, 2017

`multi group capture/match`

https://developer.mozilla.org/zh-CN/docs/Web/JavaScript/Reference/Global_Objects/RegExp

/**
 * Removes the CSS `url(...)` wrapper from a property value, leaving the bare URL.
 *
 * Handles double-quoted, single-quoted and unquoted forms, and unwraps every
 * `url(...)` token when the value contains several. Text outside of
 * `url(...)` tokens (for example `none`) is returned unchanged.
 *
 * @param {string} cssValue - A CSS value such as `url("https://example.com/a.svg")`.
 * @returns {string} The value with each `url(...)` token replaced by its URL.
 */
function extractCssUrl(cssValue) {
    // Group 1 captures the optional opening quote and `\1` requires the same
    // quote to close; group 2 is the URL itself.
    return cssValue.replace(/url\(\s*(["']?)(.*?)\1\s*\)/g, '$2');
}

const x = `url("https://rollbar.com/assets/shared/logos/ruby-icon.svg")`;

const y = extractCssUrl(x);

//  "https://rollbar.com/assets/shared/logos/ruby-icon.svg"

https://gist.github.com/xgqfrms-GitHub/c8b3992534950ea68d7333951a795ebb

https://rollbar.com/docs/

Author

xgqfrms-GitHub commented Jun 4, 2017

All in One === OK

// Every anchor inside the `ul.image-list` list.
const links = document.querySelectorAll(`ul.image-list a`);

// Read the icon from the computed style of the first link's `::before`
// pseudo-element. Looked up once and reused for both the log and the cleanup.
// Declared with `let` on purpose: follow-up snippets reassign `img` and
// `newImage` for other links.
let img = getComputedStyle(links[0], '::before').getPropertyValue('background-image');
console.log(img);
// url("https://rollbar.com/assets/shared/logos/ruby-icon.svg")

// Unwrap `url(...)`; accepts double-quoted, single-quoted and unquoted forms.
let newImage = img.replace(/url\(\s*(["']?)(.*?)\1\s*\)/g, '$2');

console.log(newImage);
// https://rollbar.com/assets/shared/logos/ruby-icon.svg

Author

xgqfrms-GitHub commented Jun 4, 2017

{
    // Repeats the background-image lookup for the second link. `links`, `img`
    // and `newImage` are not declared here; they must already exist in the
    // session (they are assigned, not redeclared).
    const readBackground = (pseudo) =>
        getComputedStyle(links[1], pseudo).getPropertyValue('background-image');

    console.log(readBackground(':before'));
    // Recorded output: url("https://rollbar.com/assets/shared/logos/ruby-icon.svg")
    // NOTE(review): identical to the output recorded for links[0]; possibly
    // copied from that example -- verify against the page.

    img = readBackground('::before');

    // Strip the leading `url("` and trailing `")` to leave the bare URL.
    newImage = img.replace(/(url\(")|("\))/g, '');
}