Last active
December 21, 2017 05:12
-
-
Save itsecurityco/1e79596fac69fbdd49a33784e2e766f2 to your computer and use it in GitHub Desktop.
Scrape wordpress plugins
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/python | |
| # @itseco | |
| # https://github.com/itseco/ | |
| # Extract download URLs for popular WordPress plugins | |
| # Usage: python script.py <pages> | |
| # Example: python script.py 99 | |
| from lxml import html | |
| import sys | |
| import requests | |
| from requests.packages.urllib3.exceptions import InsecureRequestWarning | |
| requests.packages.urllib3.disable_warnings(InsecureRequestWarning) | |
# Seconds to wait on a single HTTP request; without a timeout a stalled
# connection would block the scraper indefinitely.
REQUEST_TIMEOUT = 30

# popular plugins (99 pages)
# all plugins (3,819 pages): "https://wordpress.org/plugins/page/%s/?s"
LISTING_URL = "https://cl.wordpress.org/plugins/browse/popular/page/%s/"


def _fetch_tree(url):
    """Download *url* and return its parsed HTML tree, or None on failure.

    Network errors and HTTP error statuses are reported on stderr so one bad
    page does not abort the whole run or pollute the URL list on stdout.
    """
    try:
        # NOTE(review): verify=False turns off TLS certificate checking, as in
        # the original script; consider removing it so scraped download links
        # cannot be tampered with in transit.
        response = requests.get(url, verify=False, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        sys.stderr.write("warning: skipping %s (%s)\n" % (url, exc))
        return None
    return html.fromstring(response.content)


def main(argv):
    """Print the download URL of every plugin on the first N listing pages.

    argv follows the sys.argv layout: argv[1] is the number of listing pages
    to walk (pages 1..N). Each download link is written on its own line to
    stdout. Returns the process exit status: 0 on success, 2 when the page
    count is missing, not an integer, or negative.
    """
    try:
        page_count = int(argv[1])
    except (IndexError, ValueError):
        page_count = -1
    if page_count < 0:
        sys.stderr.write("Usage: python script.py pages\n")
        return 2

    for page_number in range(1, page_count + 1):
        listing = _fetch_tree(LISTING_URL % page_number)
        if listing is None:
            continue
        for plugin_url in listing.xpath('//h2[@class="entry-title"]/a/@href'):
            detail = _fetch_tree(plugin_url)
            if detail is None:
                continue
            # One link per line: joining with '' would fuse several hrefs into
            # a single unusable string and emit blank lines for no match.
            for link in detail.xpath('//div[@class="plugin-actions"]/a/@href'):
                print(link)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment