SharePoint downloader for Python
""" | |
Download all document links from a web page. | |
Used to extract all the docouments from a Sharepoint document library. | |
by Mike Koss, September, 2009 | |
Notes: | |
9/14/09 mck: | |
Using Beautiful soup version 3.0.7a. The current version (3.1.0.1) is NOT resilient | |
to the errors in my sharepoint document directory - since it uses the HTMLParser | |
module which itself raises an exception on reading a malformed tag. | |
""" | |
import os
import sys
import getopt
import re
import urllib
import urlparse

from BeautifulSoup import BeautifulStoneSoup

reExtensions = re.compile(r"^.*\.(docx?|xls|fla|swf|txt|jpg|jpeg|gif|png|pdf|zip|mdbx?|mht|xml)$", re.I)
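# Quick sanity check of the extension filter (the paths are illustrative):
#
#   reExtensions.match("http://example.com/shared/report.docx")    # matches
#   reExtensions.match("http://example.com/shared/AllItems.aspx")  # None - not a document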
def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:], "hd:t:u:p:a:")
    except getopt.GetoptError, err:
        print str(err)
        usage()
        sys.exit(2)

    if len(opts) == 0:
        usage()
        sys.exit(2)

    targetDir = None
    url = None
    for o, a in opts:
        if o == '-h':
            usage()
            sys.exit()
        if o == '-t':
            targetDir = a
        if o == '-d':
            url = a

    # Guard against a missing -d option, which would otherwise raise a NameError
    if url is None:
        usage()
        sys.exit(2)

    get_url_files(url, targetDir)
def usage():
    print "Usage: %s -d url [-h | -t target_dir]\n" % os.path.basename(sys.argv[0])
    print "Options\n"
    print "-h\t\t: help"
    print "-d\t\t: download the linked documents referenced on the web page"
    print "-t\t\t: target directory to place the files"
def get_url_files(url, targetDir=None):
    if targetDir is None:
        parts = urlparse.urlparse(url)
        targetDir = "%s_files" % parts[1]

    print "Scanning %s, copy files to -> %s" % (url, targetDir)

    (links, dirs) = extract_links(url)
    if len(links) == 0:
        print "No files to download."
    else:
        print "Found %d files to download." % len(links)
        download_files(links, targetDir)

    # Recursively download all the documents in sub-directories
    for (url, sDir) in dirs:
        get_url_files(url, "%s/%s" % (targetDir, sDir))
reSPDir = re.compile(r"^.*SubmitFormPost\(['\"](.+)&View=.*['\"]\).*$", re.I)
reSPPath = re.compile(r"^.*RootFolder=%2f(.*)$")
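# Applied to the sample onclick shown in extract_links() below, reSPDir
# captures the posted folder URL up to "&View=", and reSPPath then strips
# everything through "RootFolder=%2f":
#
#   url  -> "http://old.mit-club.org/WebContent/Forms/AllItems.aspx?RootFolder=%2fWebContent%2fDummy%20Folder"
#   path -> "WebContent%2fDummy%20Folder" (its last "%2f" segment, "Dummy%20Folder", names the sub-directory)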
def extract_links(url):
    """
    Scan a web page for all the <a> tags referencing documents (must have one of the
    extensions in reExtensions).

    Returns a tuple (links, dirs): links is a list of (fully qualified) urls to
    documents; dirs is a list of (url, name) pairs for SharePoint sub-folders.
    """
    sock = urllib.urlopen(url)
    htmlSource = sock.read()
    sock.close()

    links = []
    dirs = []

    soup = BeautifulStoneSoup(htmlSource)
    for link in soup.findAll('a'):
        href = urlparse.urljoin(url, link['href'])
        if reExtensions.match(href) is None:
            continue
        links.append(href)
    links = list(set(links))

    mUrls = set()

    # SharePoint directories are not regular href's - pull path info from onclick javascript
    # Example onclick:
    #
    # javascript:ClearSearchTerm("{8EF6AB92-467B-410F-94E3-82048923368B}");
    # javascript:SubmitFormPost("http://old.mit-club.org/WebContent/Forms/AllItems.aspx?
    #     RootFolder=%2fWebContent%2fDummy%20Folder&
    #     View=%7b8EF6AB92%2d467B%2d410F%2d94E3%2d82048923368B%7d");javascript:return false;
    for link in soup.findAll('a', href='javascript:SubmitFormPost()'):
        matchDir = reSPDir.match(link['onclick'])
        if matchDir is None:
            print "Error parsing onclick directory name: %r" % link['onclick']
            continue
        url = matchDir.group(1)
        if url in mUrls:
            continue
        mUrls.add(url)
        print "url: %s" % url
        matchPath = reSPPath.match(url)
        # Guard against a folder URL with no RootFolder= query parameter,
        # which would otherwise raise an AttributeError on group(1)
        if matchPath is None:
            print "Error parsing RootFolder from url: %s" % url
            continue
        aPath = matchPath.group(1).split("%2f")
        dirs.append((url, aPath[-1]))

    return (links, dirs)
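# Why BeautifulStoneSoup (see the header note): Beautiful Soup 3.0.x is built
# on the lenient SGMLParser, so malformed markup like the illustrative fragment
# below parses without raising, whereas 3.1.x delegates to HTMLParser, which
# can raise HTMLParseError on the same input:
#
#   BeautifulStoneSoup('<a href="report.doc" <b>broken</b></a>').findAll('a')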
reFilename = re.compile(r"^.*/([^/]+)$")

def download_files(links, targetDir):
    if not os.path.exists(targetDir):
        os.makedirs(targetDir)

    for link in links:
        parts = urlparse.urlparse(link)
        match = reFilename.match(parts[2])
        if match is None:
            raise Exception("Error processing file name: %s" % link)
        sFilename = "%s/%s" % (targetDir, match.group(1))
        if os.path.exists(sFilename):
            print "File exists (%s) - skipping" % sFilename
            continue
        else:
            print "Writing file: %s" % sFilename
            # 'f' avoids shadowing the built-in 'file' type
            f = open(sFilename, 'wb')
            sock = urllib.urlopen(link)
            data = sock.read()
            sock.close()
            f.write(data)
            f.close()
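# A minimal alternative sketch: urllib.urlretrieve copies the response to disk
# in fixed-size blocks rather than reading the whole document into memory,
# which may matter for large files:
#
#   urllib.urlretrieve(link, sFilename)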
if __name__ == "__main__":
    main()