Created
June 18, 2018 10:12
-
-
Save gregtaole/046859ab26a9a0ed4ea85c3ac46688b8 to your computer and use it in GitHub Desktop.
Scrape https://docs.python.org/3/py-modindex.html to get the list of Python standard-library modules, then use the generated list to display the third-party dependencies of a Python project.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bufio" | |
"flag" | |
"fmt" | |
"os" | |
"path/filepath" | |
"regexp" | |
"strings" | |
"sync" | |
) | |
// wg counts the worker goroutines started by main; findImports and
// ParseImports each call wg.Done when they return.
var wg sync.WaitGroup | |
// pyStdlib lists the module names that removeStdlibAndUser treats as part of
// the Python standard library and therefore removes from the reported
// dependencies. Dotted submodule names are included alongside their parents.
// NOTE(review): the list looks like the output of the companion scraper for
// https://docs.python.org/3/py-modindex.html — confirm which Python version it
// was generated from before relying on it.
var pyStdlib = []string{ | |
"__future__", | |
"__main__", | |
"_dummy_thread", | |
"_thread", | |
"abc", | |
"aifc", | |
"argparse", | |
"array", | |
"ast", | |
"asynchat", | |
"asyncio", | |
"asyncore", | |
"atexit", | |
"audioop", | |
"base64", | |
"bdb", | |
"binascii", | |
"binhex", | |
"bisect", | |
"builtins", | |
"bz2", | |
"calendar", | |
"cgi", | |
"cgitb", | |
"chunk", | |
"cmath", | |
"cmd", | |
"code", | |
"codecs", | |
"codeop", | |
"collections", | |
"collections.abc", | |
"colorsys", | |
"compileall", | |
"concurrent", | |
"concurrent.futures", | |
"configparser", | |
"contextlib", | |
"copy", | |
"copyreg", | |
"cProfile", | |
"crypt", | |
"csv", | |
"ctypes", | |
"curses", | |
"curses.ascii", | |
"curses.panel", | |
"curses.textpad", | |
"datetime", | |
"dbm", | |
"dbm.dumb", | |
"dbm.gnu", | |
"dbm.ndbm", | |
"decimal", | |
"difflib", | |
"dis", | |
"distutils", | |
"distutils.archive_util", | |
"distutils.bcppcompiler", | |
"distutils.ccompiler", | |
"distutils.cmd", | |
"distutils.command", | |
"distutils.command.bdist", | |
"distutils.command.bdist_dumb", | |
"distutils.command.bdist_msi", | |
"distutils.command.bdist_packager", | |
"distutils.command.bdist_rpm", | |
"distutils.command.bdist_wininst", | |
"distutils.command.build", | |
"distutils.command.build_clib", | |
"distutils.command.build_ext", | |
"distutils.command.build_py", | |
"distutils.command.build_scripts", | |
"distutils.command.check", | |
"distutils.command.clean", | |
"distutils.command.config", | |
"distutils.command.install", | |
"distutils.command.install_data", | |
"distutils.command.install_headers", | |
"distutils.command.install_lib", | |
"distutils.command.install_scripts", | |
"distutils.command.register", | |
"distutils.command.sdist", | |
"distutils.core", | |
"distutils.cygwinccompiler", | |
"distutils.debug", | |
"distutils.dep_util", | |
"distutils.dir_util", | |
"distutils.dist", | |
"distutils.errors", | |
"distutils.extension", | |
"distutils.fancy_getopt", | |
"distutils.file_util", | |
"distutils.filelist", | |
"distutils.log", | |
"distutils.msvccompiler", | |
"distutils.spawn", | |
"distutils.sysconfig", | |
"distutils.text_file", | |
"distutils.unixccompiler", | |
"distutils.util", | |
"distutils.version", | |
"doctest", | |
"dummy_threading", | |
"email", | |
"email.charset", | |
"email.contentmanager", | |
"email.encoders", | |
"email.errors", | |
"email.generator", | |
"email.header", | |
"email.headerregistry", | |
"email.iterators", | |
"email.message", | |
"email.mime", | |
"email.parser", | |
"email.policy", | |
"email.utils", | |
"encodings", | |
"encodings.idna", | |
"encodings.mbcs", | |
"encodings.utf_8_sig", | |
"ensurepip", | |
"enum", | |
"errno", | |
"faulthandler", | |
"fcntl", | |
"filecmp", | |
"fileinput", | |
"fnmatch", | |
"formatter", | |
"fpectl", | |
"fractions", | |
"ftplib", | |
"functools", | |
"gc", | |
"getopt", | |
"getpass", | |
"gettext", | |
"glob", | |
"grp", | |
"gzip", | |
"hashlib", | |
"heapq", | |
"hmac", | |
"html", | |
"html.entities", | |
"html.parser", | |
"http", | |
"http.client", | |
"http.cookiejar", | |
"http.cookies", | |
"http.server", | |
"imaplib", | |
"imghdr", | |
"imp", | |
"importlib", | |
"importlib.abc", | |
"importlib.machinery", | |
"importlib.util", | |
"inspect", | |
"io", | |
"ipaddress", | |
"itertools", | |
"json", | |
"json.tool", | |
"keyword", | |
"lib2to3", | |
"linecache", | |
"locale", | |
"logging", | |
"logging.config", | |
"logging.handlers", | |
"lzma", | |
"macpath", | |
"mailbox", | |
"mailcap", | |
"marshal", | |
"math", | |
"mimetypes", | |
"mmap", | |
"modulefinder", | |
"msilib", | |
"msvcrt", | |
"multiprocessing", | |
"multiprocessing.connection", | |
"multiprocessing.dummy", | |
"multiprocessing.managers", | |
"multiprocessing.pool", | |
"multiprocessing.sharedctypes", | |
"netrc", | |
"nis", | |
"nntplib", | |
"numbers", | |
"operator", | |
"optparse", | |
"os", | |
"os.path", | |
"ossaudiodev", | |
"parser", | |
"pathlib", | |
"pdb", | |
"pickle", | |
"pickletools", | |
"pipes", | |
"pkgutil", | |
"platform", | |
"plistlib", | |
"poplib", | |
"posix", | |
"pprint", | |
"profile", | |
"pstats", | |
"pty", | |
"pwd", | |
"py_compile", | |
"pyclbr", | |
"pydoc", | |
"queue", | |
"quopri", | |
"random", | |
"re", | |
"readline", | |
"reprlib", | |
"resource", | |
"rlcompleter", | |
"runpy", | |
"sched", | |
"secrets", | |
"select", | |
"selectors", | |
"shelve", | |
"shlex", | |
"shutil", | |
"signal", | |
"site", | |
"smtpd", | |
"smtplib", | |
"sndhdr", | |
"socket", | |
"socketserver", | |
"spwd", | |
"sqlite3", | |
"ssl", | |
"stat", | |
"statistics", | |
"string", | |
"stringprep", | |
"struct", | |
"subprocess", | |
"sunau", | |
"symbol", | |
"symtable", | |
"sys", | |
"sysconfig", | |
"syslog", | |
"tabnanny", | |
"tarfile", | |
"telnetlib", | |
"tempfile", | |
"termios", | |
"test", | |
"test.support", | |
"textwrap", | |
"threading", | |
"time", | |
"timeit", | |
"tkinter", | |
"tkinter.scrolledtext", | |
"tkinter.tix", | |
"tkinter.ttk", | |
"token", | |
"tokenize", | |
"trace", | |
"traceback", | |
"tracemalloc", | |
"tty", | |
"turtle", | |
"turtledemo", | |
"types", | |
"typing", | |
"unicodedata", | |
"unittest", | |
"unittest.mock", | |
"urllib", | |
"urllib.error", | |
"urllib.parse", | |
"urllib.request", | |
"urllib.response", | |
"urllib.robotparser", | |
"uu", | |
"uuid", | |
"venv", | |
"warnings", | |
"wave", | |
"weakref", | |
"webbrowser", | |
"winreg", | |
"winsound", | |
"wsgiref", | |
"wsgiref.handlers", | |
"wsgiref.headers", | |
"wsgiref.simple_server", | |
"wsgiref.util", | |
"wsgiref.validate", | |
"xdrlib", | |
"xml", | |
"xml.dom", | |
"xml.dom.minidom", | |
"xml.dom.pulldom", | |
"xml.etree.ElementTree", | |
"xml.parsers.expat", | |
"xml.parsers.expat.errors", | |
"xml.parsers.expat.model", | |
"xml.sax", | |
"xml.sax.handler", | |
"xml.sax.saxutils", | |
"xml.sax.xmlreader", | |
"xmlrpc", | |
"xmlrpc.client", | |
"xmlrpc.server", | |
"zipapp", | |
"zipfile", | |
"zipimport", | |
"zlib", | |
} | |
func main() { | |
pathFlag := flag.String("d", ".", "Path to the directory containing the python source files") | |
excludeFlag := flag.String("e", "__pycache__", "Comma-separated list of directories to exclude") | |
flag.Parse() | |
excludeDirs := strings.Split(*excludeFlag, ",") | |
pyFiles := make([]string, 0) | |
err := filepath.Walk(*pathFlag, func(path string, info os.FileInfo, err error) error { | |
if err != nil { | |
return fmt.Errorf("could not read filepath %q : %v", *pathFlag, err) | |
} | |
for _, dir := range excludeDirs { | |
if info.IsDir() && info.Name() == dir { | |
return filepath.SkipDir | |
} | |
} | |
matched, err := regexp.MatchString(".py", path) | |
if err != nil { | |
return fmt.Errorf("error applying regular expression to %q : %v", path, err) | |
} | |
if matched { | |
pyFiles = append(pyFiles, path) | |
} | |
return nil | |
}) | |
if err != nil { | |
fmt.Printf("error while walking the directory tree at %q : %v", *pathFlag, err) | |
} | |
importsChan := make(chan string) | |
errChan := make(chan error) | |
wg.Add(len(pyFiles)) | |
for _, file := range pyFiles { | |
go findImports(file, importsChan, errChan) | |
} | |
imports := make([]string, 0) | |
go func() { | |
for val := range importsChan { | |
imports = append(imports, val) | |
} | |
}() | |
go func() { | |
for err := range errChan { | |
fmt.Fprintf(os.Stderr, "%v", err) | |
} | |
}() | |
wg.Wait() | |
packagesChan := make(chan string) | |
packages := make([]string, 0) | |
for _, importString := range imports { | |
wg.Add(1) | |
go ParseImports(importString, packagesChan, errChan) | |
} | |
go func() { | |
for pack := range packagesChan { | |
packages = append(packages, pack) | |
} | |
}() | |
wg.Wait() | |
uniq := unique(packages) | |
clean := removeStdlibAndUser(uniq, pyFiles) | |
for _, mod := range clean { | |
fmt.Println(mod) | |
} | |
} | |
func findImports(filePath string, importsChan chan<- string, errChan chan<- error) { | |
defer wg.Done() | |
file, err := os.Open(filePath) | |
if err != nil { | |
errChan <- fmt.Errorf("could open file %v for reading : %v", filePath, err) | |
return | |
} | |
scanner := bufio.NewScanner(file) | |
for scanner.Scan() { | |
text := scanner.Text() | |
matches, err := regexp.MatchString("^import|from.*import", text) | |
if err != nil { | |
errChan <- fmt.Errorf("error while parsing regular expression ^import|from.*import : %v", err) | |
return | |
} | |
if matches { | |
importsChan <- text | |
} | |
} | |
if scanner.Err() != nil { | |
errChan <- fmt.Errorf("error while scanning %v : %v", filePath, err) | |
} | |
} | |
/* | |
ParseImports extracts the name of the python library contained in importString | |
*/ | |
func ParseImports(importString string, packagesChan chan<- string, errChan chan<- error) { | |
defer wg.Done() | |
imp := strings.Split(importString, " ")[1] | |
if strings.Contains(imp, ".") { | |
module := strings.Split(imp, ".")[0] | |
packagesChan <- module | |
return | |
} | |
packagesChan <- imp | |
} | |
// unique returns the distinct strings of imports in order of first
// appearance. The result is a new, non-nil slice; imports is not modified.
func unique(imports []string) []string {
	seen := make(map[string]struct{}, len(imports))
	uniqueImports := make([]string, 0, len(imports))
	for _, imp := range imports {
		if _, ok := seen[imp]; ok {
			continue
		}
		seen[imp] = struct{}{}
		// Appending here rather than ranging over the map afterwards keeps
		// the output order deterministic.
		uniqueImports = append(uniqueImports, imp)
	}
	return uniqueImports
}
func removeStdlibAndUser(uniq, pyFiles []string) []string { | |
clean := make([]string, 0) | |
for _, mod := range uniq { | |
found := false | |
for _, ex := range pyStdlib { | |
if mod == ex { | |
found = true | |
} | |
} | |
for _, ex := range pyFiles { | |
if strings.Contains(ex, mod) { | |
found = true | |
} | |
} | |
if !found { | |
clean = append(clean, mod) | |
} | |
} | |
return clean | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bytes" | |
"errors" | |
"fmt" | |
"io" | |
"log" | |
"net/http" | |
"regexp" | |
"golang.org/x/net/html" | |
) | |
const url = "https://docs.python.org/3/py-modindex.html" | |
func main() { | |
resp, err := http.Get(url) | |
if err != nil { | |
log.Fatalf("could not get url %v : %v", url, err) | |
} | |
defer resp.Body.Close() | |
doc, err := html.Parse(resp.Body) | |
if err != nil { | |
log.Fatalf("could not parse response body : %v", err) | |
} | |
content, err := getContent(doc) | |
if err != nil { | |
log.Fatalf("could not get content from parsed document : %v", err) | |
} | |
fmt.Println("[]string{") | |
re := regexp.MustCompile("(<code class=\"xref\">)|(</code>)") | |
for _, mod := range content { | |
fmt.Printf("\t\"%v\",\n", re.ReplaceAllString(string(renderNode(mod)), "")) | |
} | |
fmt.Println("}") | |
} | |
func getContent(doc *html.Node) ([]*html.Node, error) { | |
modules := make([]*html.Node, 0) | |
var f func(*html.Node) | |
f = func(n *html.Node) { | |
if n.Type == html.ElementNode { | |
for _, attr := range n.Attr { | |
if attr.Val == "xref" { | |
modules = append(modules, n) | |
} | |
} | |
} | |
for c := n.FirstChild; c != nil; c = c.NextSibling { | |
f(c) | |
} | |
} | |
f(doc) | |
if len(modules) == 0 { | |
return nil, errors.New("\"code\" tag not found") | |
} | |
return modules, nil | |
} | |
func renderNode(n *html.Node) []byte { | |
var buf bytes.Buffer | |
w := io.Writer(&buf) | |
html.Render(w, n) | |
return buf.Bytes() | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment