Skip to content

Instantly share code, notes, and snippets.

Created June 18, 2018 10:12
Show Gist options
  • Save gregtaole/046859ab26a9a0ed4ea85c3ac46688b8 to your computer and use it in GitHub Desktop.
Save gregtaole/046859ab26a9a0ed4ea85c3ac46688b8 to your computer and use it in GitHub Desktop.
Scrape a list of Python modules, then use the generated list to display the dependencies of a Python project.
package main
import (
var wg sync.WaitGroup
var pyStdlib = []string{
// main walks the directory given by -d, collects the paths it considers Python
// files, extracts their import lines with findImports, reduces those lines to
// package names with ParseImports, and then filters the de-duplicated names
// through removeStdlibAndUser.
//
// NOTE(review): this listing is missing lines (closing braces, and any
// wg.Add / wg.Wait / close calls); the comments below describe only the
// statements that are visible.
func main() {
// -d selects the directory to scan; -e is a comma-separated list of directory
// names to skip.
// NOTE(review): no flag.Parse() call is visible before the flags are
// dereferenced — confirm it exists in the real file.
pathFlag := flag.String("d", ".", "Path to the directory containing the python source files")
excludeFlag := flag.String("e", "__pycache__", "Comma-separated list of directories to exclude")
excludeDirs := strings.Split(*excludeFlag, ",")
pyFiles := make([]string, 0)
// Walk the tree, skipping excluded directories and recording matching paths.
err := filepath.Walk(*pathFlag, func(path string, info os.FileInfo, err error) error {
if err != nil {
return fmt.Errorf("could not read filepath %q : %v", *pathFlag, err)
// A directory whose base name equals an excluded name is skipped entirely.
for _, dir := range excludeDirs {
if info.IsDir() && info.Name() == dir {
return filepath.SkipDir
// NOTE(review): the pattern is unanchored and "." matches any character, so
// every path containing some character followed by "py" is accepted (for
// example "copy.txt" or "x.pyc"), not only files ending in ".py".
matched, err := regexp.MatchString(".py", path)
if err != nil {
return fmt.Errorf("error applying regular expression to %q : %v", path, err)
if matched {
pyFiles = append(pyFiles, path)
return nil
// A walk failure is printed to standard output.
if err != nil {
fmt.Printf("error while walking the directory tree at %q : %v", *pathFlag, err)
// Both channels are unbuffered; one findImports goroutine is started per file.
importsChan := make(chan string)
errChan := make(chan error)
for _, file := range pyFiles {
go findImports(file, importsChan, errChan)
imports := make([]string, 0)
// Collector goroutine: gathers every import line sent on importsChan.
// NOTE(review): imports is appended to here and ranged over below from main;
// the synchronization between the two is not visible in this listing — confirm.
go func() {
for val := range importsChan {
imports = append(imports, val)
// Error reporter goroutine: writes each error to stderr (no newline is added).
go func() {
for err := range errChan {
fmt.Fprintf(os.Stderr, "%v", err)
packagesChan := make(chan string)
packages := make([]string, 0)
// One ParseImports goroutine is started per collected import line.
for _, importString := range imports {
go ParseImports(importString, packagesChan, errChan)
// Collector goroutine: gathers every package name sent on packagesChan.
go func() {
for pack := range packagesChan {
packages = append(packages, pack)
// De-duplicate, then filter with removeStdlibAndUser using the collected file paths.
uniq := unique(packages)
clean := removeStdlibAndUser(uniq, pyFiles)
// The body of this loop over the remaining modules is not visible in this listing.
for _, mod := range clean {
func findImports(filePath string, importsChan chan<- string, errChan chan<- error) {
defer wg.Done()
file, err := os.Open(filePath)
if err != nil {
errChan <- fmt.Errorf("could open file %v for reading : %v", filePath, err)
scanner := bufio.NewScanner(file)
for scanner.Scan() {
text := scanner.Text()
matches, err := regexp.MatchString("^import|from.*import", text)
if err != nil {
errChan <- fmt.Errorf("error while parsing regular expression ^import|from.*import : %v", err)
if matches {
importsChan <- text
if scanner.Err() != nil {
errChan <- fmt.Errorf("error while scanning %v : %v", filePath, err)
ParseImports extracts the name of the python library contained in importString
func ParseImports(importString string, packagesChan chan<- string, errChan chan<- error) {
defer wg.Done()
imp := strings.Split(importString, " ")[1]
if strings.Contains(imp, ".") {
module := strings.Split(imp, ".")[0]
packagesChan <- module
packagesChan <- imp
// unique returns the distinct strings found in imports, in order of first
// appearance. The result is always a non-nil slice, even for empty input.
func unique(imports []string) []string {
	seen := make(map[string]struct{}, len(imports))
	uniqueImportsList := make([]string, 0, len(imports))
	for _, imp := range imports {
		if _, ok := seen[imp]; ok {
			continue
		}
		seen[imp] = struct{}{}
		// Appending while scanning, instead of ranging over the map
		// afterwards, keeps the output order deterministic.
		uniqueImportsList = append(uniqueImportsList, imp)
	}
	return uniqueImportsList
}
func removeStdlibAndUser(uniq, pyFiles []string) []string {
clean := make([]string, 0)
for _, mod := range uniq {
found := false
for _, ex := range pyStdlib {
if mod == ex {
found = true
for _, ex := range pyFiles {
if strings.Contains(ex, mod) {
found = true
if !found {
clean = append(clean, mod)
return clean
package main
import (
const url = ""
func main() {
resp, err := http.Get(url)
if err != nil {
log.Fatalf("could not get url %v : %v", url, err)
defer resp.Body.Close()
doc, err := html.Parse(resp.Body)
if err != nil {
log.Fatalf("could not parse response body : %v", err)
content, err := getContent(doc)
if err != nil {
log.Fatalf("could not get content from parsed document : %v", err)
re := regexp.MustCompile("(<code class=\"xref\">)|(</code>)")
for _, mod := range content {
fmt.Printf("\t\"%v\",\n", re.ReplaceAllString(string(renderNode(mod)), ""))
// getContent collects nodes from the parsed document doc and returns them, or
// an error when none were collected.
//
// NOTE(review): this listing is missing lines (closing braces and, presumably,
// the recursive call inside the child loop and the initial call of f on doc);
// the comments below describe only the statements that are visible.
func getContent(doc *html.Node) ([]*html.Node, error) {
modules := make([]*html.Node, 0)
// f is declared before it is assigned so the closure can refer to itself.
var f func(*html.Node)
f = func(n *html.Node) {
// An element node is collected when one of its attributes has the value
// "xref"; the attribute's key and the element's tag name are not checked.
if n.Type == html.ElementNode {
for _, attr := range n.Attr {
if attr.Val == "xref" {
modules = append(modules, n)
// Iterate over the direct children of n (loop body not visible in this listing).
for c := n.FirstChild; c != nil; c = c.NextSibling {
// Collecting nothing is reported as an error.
if len(modules) == 0 {
return nil, errors.New("\"code\" tag not found")
return modules, nil
// renderNode passes n to html.Render with an in-memory buffer as the
// destination and returns the buffer's contents.
//
// NOTE(review): the return value of html.Render is not inspected, so a failed
// render is indistinguishable from a successful one for the caller.
func renderNode(n *html.Node) []byte {
var buf bytes.Buffer
// Expose the buffer through the io.Writer interface for html.Render.
w := io.Writer(&buf)
html.Render(w, n)
return buf.Bytes()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment