Skip to content

Instantly share code, notes, and snippets.

@Vbitz
Created April 26, 2023 01:35
Show Gist options
  • Save Vbitz/fbd79216fa413e77eb29c0731401ff3e to your computer and use it in GitHub Desktop.
Save Vbitz/fbd79216fa413e77eb29c0731401ff3e to your computer and use it in GitHub Desktop.
GitHub star and follow spidering script. Warning: very hackish.
package main
import (
"context"
"crypto/sha256"
"encoding/csv"
"encoding/hex"
"encoding/json"
"errors"
"flag"
"fmt"
"log"
"os"
"path"
"strings"
"github.com/google/go-github/github"
"github.com/schollz/progressbar/v3"
"golang.org/x/oauth2"
)
// Command-line flags.
var (
// storePath is the root directory under which getPath places every cache
// entry.
storePath = flag.String("store", "store/", "The path to store all findings.")
// maxDepth is the depth budget handed to each seed user in main; it is
// decremented as the crawl moves from users to repositories and back.
maxDepth = flag.Int("depth", 8, "The maximum recursion depth.")
)
var (
// ErrBlacklist is returned by GetRepository when the requested
// "owner/name" is present in the repository blacklist.
ErrBlacklist = fmt.Errorf("Blacklisted")
)
// PathKind names the category of a cached object. getPath uses its string
// value as the sub-directory of the store that holds entries of that kind.
type PathKind string
const (
// KIND_USER holds cached user profiles (see GetUser).
KIND_USER PathKind = "users"
// KIND_USER_REPOS holds cached per-user repository lists (see GetUserRepositories).
KIND_USER_REPOS PathKind = "userRepos"
// KIND_USER_STARS holds cached per-user starred lists (see GetUserStarredRepositories).
KIND_USER_STARS PathKind = "userStars"
// KIND_REPO holds cached single-repository details (see GetRepository).
KIND_REPO PathKind = "repos"
)
// getPath returns the cache directory for the object called name of the given
// kind: <store>/<kind>/<hex SHA-256 of name>. Hashing the name keeps the
// directory component free of characters such as "/".
func getPath(kind PathKind, name string) string {
	digest := sha256.Sum256([]byte(name))
	return path.Join(*storePath, string(kind), hex.EncodeToString(digest[:]))
}
// FlaggedRepo is one finding: a repository whose owner, name or description
// matched the word list. main writes each finding as a row of output.csv.
type FlaggedRepo struct {
// Owner is the login of the repository owner.
Owner string
// Name is the repository name (without the owner).
Name string
// Description is the repository description as reported by the API.
Description string
// Skipped is true when the repository matched but was not explored
// further because the recursion depth was exhausted.
Skipped bool
}
// MalwareIntel holds the crawler state: the API client, the configuration
// loaded by main, the sets of already-visited users and repositories, the
// pending job queue and the findings collected so far.
type MalwareIntel struct {
// client is the GitHub API client used on cache misses.
client *github.Client
// foundUsers records logins that an AnalyzeUserJob has already processed.
foundUsers map[string]bool
// foundRepos records "owner/name" ids that an AnalyzeRepoJob has already processed.
foundRepos map[string]bool
// wordList is the list of substrings IsFlagged searches for.
wordList []string
// startUsers is the list of seed logins returned by GetUserList.
startUsers []string
// blacklistUsers holds the logins from "user" lines of blacklist.txt.
blacklistUsers map[string]bool
// blacklistRepos holds the ids from "repo" lines of blacklist.txt;
// GetRepository refuses these with ErrBlacklist.
blacklistRepos map[string]bool
// jobs is the FIFO queue filled by SubmitJob and drained by main.
jobs []Job
// flaggedRepos accumulates the findings written to output.csv.
flaggedRepos []FlaggedRepo
}
// pathExists reports whether path exists on disk. A "does not exist" result
// from os.Stat is reported as (false, nil); any other stat failure is
// returned as an error.
func (a *MalwareIntel) pathExists(path string) (bool, error) {
	_, statErr := os.Stat(path)
	switch {
	case statErr == nil:
		return true, nil
	case errors.Is(statErr, os.ErrNotExist):
		return false, nil
	default:
		return false, statErr
	}
}
// ensurePath creates the directory path, along with any missing parents, and
// returns the error from os.MkdirAll (nil on success).
func (a *MalwareIntel) ensurePath(path string) error {
	return os.MkdirAll(path, os.ModePerm)
}
// readCache tries to load the cache entry stored in directory p into ret,
// which must be a pointer suitable for json decoding.
//
// It returns true when content.json was found and decoded. It returns false
// with a nil error on a cache miss: either the directory did not exist (in
// which case it is created so a later writeCache can succeed) or the
// directory exists but content.json does not. Any other failure, including a
// JSON decoding error, is returned as an error.
func (a *MalwareIntel) readCache(p string, ret any) (bool, error) {
	dirPresent, err := a.pathExists(p)
	if err != nil {
		return false, err
	}
	if !dirPresent {
		// First time this entry is seen: prepare its directory.
		return false, a.ensurePath(p)
	}

	file, err := os.Open(path.Join(p, "content.json"))
	if errors.Is(err, os.ErrNotExist) {
		// Directory without content: nothing has been cached yet.
		return false, nil
	}
	if err != nil {
		return false, err
	}
	defer file.Close()

	if err := json.NewDecoder(file).Decode(ret); err != nil {
		return false, err
	}
	return true, nil
}
// writeCache stores content as JSON in <p>/content.json, replacing any
// previous entry.
//
// The directory p is created if it is missing, so the call does not depend
// on an earlier readCache having prepared it. If encoding fails, the
// partially written file is removed so that a later readCache sees a clean
// miss instead of a permanently undecodable entry. The error from closing the
// file is returned because a failed close can mean the data never reached
// disk.
func (a *MalwareIntel) writeCache(p string, content any) error {
	if err := os.MkdirAll(p, os.ModePerm); err != nil {
		return err
	}
	contentPath := path.Join(p, "content.json")
	f, err := os.Create(contentPath)
	if err != nil {
		return err
	}
	if err := json.NewEncoder(f).Encode(content); err != nil {
		// Best-effort cleanup; the encoding error is the one worth reporting.
		f.Close()
		os.Remove(contentPath)
		return err
	}
	return f.Close()
}
// GetUserList returns the seed logins the crawl starts from.
func (a *MalwareIntel) GetUserList() []string {
	seeds := a.startUsers
	return seeds
}
// GetUser returns the profile of the user with login name. The on-disk cache
// is consulted first; on a miss the profile is fetched from the GitHub API
// and written to the cache before being returned.
func (a *MalwareIntel) GetUser(name string) (*github.User, error) {
	cacheDir := getPath(KIND_USER, name)

	var cached *github.User
	hit, err := a.readCache(cacheDir, &cached)
	if err != nil {
		return nil, err
	}
	if hit {
		return cached, nil
	}

	log.Printf("cache miss for user: %s", name)
	fetched, _, err := a.client.Users.Get(context.Background(), name)
	if err != nil {
		return nil, err
	}
	if err := a.writeCache(cacheDir, fetched); err != nil {
		return nil, err
	}
	return fetched, nil
}
// GetUserRepositories returns the repositories listed for user u. The
// on-disk cache is consulted first; on a miss the complete list is fetched
// from the GitHub API and cached.
//
// The API returns results one page at a time, so every page is requested
// until the response reports no next page. Fetching only the first page
// would silently truncate the list for users with many repositories.
func (a *MalwareIntel) GetUserRepositories(u *github.User) ([]*github.Repository, error) {
	name := u.GetLogin()
	userPath := getPath(KIND_USER_REPOS, name)
	var repos []*github.Repository
	ok, err := a.readCache(userPath, &repos)
	if err != nil {
		return nil, err
	}
	if ok {
		return repos, nil
	}

	log.Printf("cache miss for user repo list: %s", name)
	// Start from an empty (non-nil) slice so an empty result is cached as
	// "[]" rather than "null".
	repos = []*github.Repository{}
	opts := &github.RepositoryListOptions{
		ListOptions: github.ListOptions{PerPage: 100},
	}
	for {
		page, resp, err := a.client.Repositories.List(context.Background(), name, opts)
		if err != nil {
			return nil, err
		}
		repos = append(repos, page...)
		if resp == nil || resp.NextPage == 0 {
			break
		}
		opts.Page = resp.NextPage
	}

	if err := a.writeCache(userPath, repos); err != nil {
		return nil, err
	}
	return repos, nil
}
// GetUserStarredRepositories returns the repositories starred by user u. The
// on-disk cache is consulted first; on a miss the complete list is fetched
// from the GitHub API and cached.
//
// The API returns results one page at a time, so every page is requested
// until the response reports no next page. Fetching only the first page
// would silently truncate the list for users with many stars.
func (a *MalwareIntel) GetUserStarredRepositories(u *github.User) ([]*github.StarredRepository, error) {
	name := u.GetLogin()
	userPath := getPath(KIND_USER_STARS, name)
	var repos []*github.StarredRepository
	ok, err := a.readCache(userPath, &repos)
	if err != nil {
		return nil, err
	}
	if ok {
		return repos, nil
	}

	log.Printf("cache miss for user starred list: %s", name)
	// Start from an empty (non-nil) slice so an empty result is cached as
	// "[]" rather than "null".
	repos = []*github.StarredRepository{}
	opts := &github.ActivityListStarredOptions{
		ListOptions: github.ListOptions{PerPage: 100},
	}
	for {
		page, resp, err := a.client.Activity.ListStarred(context.Background(), name, opts)
		if err != nil {
			return nil, err
		}
		repos = append(repos, page...)
		if resp == nil || resp.NextPage == 0 {
			break
		}
		opts.Page = resp.NextPage
	}

	if err := a.writeCache(userPath, repos); err != nil {
		return nil, err
	}
	return repos, nil
}
// GetRepository returns the full details of the repository login/name. The
// on-disk cache is consulted first; on a miss the repository is fetched from
// the GitHub API and cached.
//
// It returns ErrBlacklist, without touching the cache or the API, when
// "login/name" is in the repository blacklist.
func (a *MalwareIntel) GetRepository(login string, name string) (*github.Repository, error) {
	id := login + "/" + name
	if _, ok := a.blacklistRepos[id]; ok {
		return nil, ErrBlacklist
	}

	// Key the cache on the full "owner/name" id. Keying on the bare name
	// makes same-named repositories of different owners (typically a fork
	// and its parent) share one entry and return each other's data.
	repoPath := getPath(KIND_REPO, id)
	var repo *github.Repository
	ok, err := a.readCache(repoPath, &repo)
	if err != nil {
		return nil, err
	}
	if ok {
		return repo, nil
	}

	log.Printf("cache miss for repository: %s/%s", login, name)
	repo, _, err = a.client.Repositories.Get(context.Background(), login, name)
	if err != nil {
		return nil, err
	}
	if err := a.writeCache(repoPath, repo); err != nil {
		return nil, err
	}
	return repo, nil
}
// IsFlagged reports whether s contains any entry of the word list, ignoring
// case.
//
// Entries are trimmed and lower-cased before comparison. Blank entries are
// skipped: strings.Contains(x, "") is true for every x, so a single empty
// entry (for example from a trailing newline in the word file) would
// otherwise flag every string.
func (a *MalwareIntel) IsFlagged(s string) bool {
	lower := strings.ToLower(s)
	for _, word := range a.wordList {
		word = strings.ToLower(strings.TrimSpace(word))
		if word == "" {
			continue
		}
		if strings.Contains(lower, word) {
			return true
		}
	}
	return false
}
// Job is a unit of crawl work. Jobs are queued with SubmitJob and executed
// one at a time by the loop in main; a non-nil error from Run stops the loop.
type Job interface {
// Run performs the work and may queue further jobs.
Run() error
}
// SubmitJob appends job to the end of the pending queue.
func (a *MalwareIntel) SubmitJob(job Job) {
	queue := append(a.jobs, job)
	a.jobs = queue
}
// AnalyzeUser queues an AnalyzeUserJob for user with the given remaining
// depth, unless the user has already been processed. It always returns nil.
func (a *MalwareIntel) AnalyzeUser(user *github.User, depth int) error {
	login := user.GetLogin()
	if _, seen := a.foundUsers[login]; !seen {
		a.SubmitJob(&AnalyzeUserJob{app: a, user: user, depth: depth})
	}
	return nil
}
// AnalyzeUserJob explores one user: it queues an analysis of every repository
// the user owns and every repository the user has starred.
type AnalyzeUserJob struct {
	app   *MalwareIntel // crawler state shared by all jobs
	user  *github.User  // the user to explore
	depth int           // remaining recursion budget
}

// Run processes the user unless it was already processed, is blacklisted, or
// the recursion budget is exhausted. Repositories found are queued with a
// budget of depth-1. Errors from fetching the repository or starred lists are
// returned to the caller.
func (j *AnalyzeUserJob) Run() error {
	login := j.user.GetLogin()

	// Make sure we haven't already looked at this user.
	if _, ok := j.app.foundUsers[login]; ok {
		return nil
	}
	// Honour "user" entries of the blacklist; the set is loaded at startup
	// and is ignored unless it is consulted here.
	if j.app.blacklistUsers[login] {
		return nil
	}
	if j.depth <= 0 {
		return nil
	}
	j.app.foundUsers[login] = true

	// Search through repositories.
	repos, err := j.app.GetUserRepositories(j.user)
	if err != nil {
		return err
	}
	for _, repo := range repos {
		if err := j.app.AnalyzeRepo(repo, j.depth-1); err != nil {
			return err
		}
	}

	// Search through starred. (Followers and following are not explored.)
	starred, err := j.app.GetUserStarredRepositories(j.user)
	if err != nil {
		return err
	}
	for _, star := range starred {
		if err := j.app.AnalyzeRepo(star.GetRepository(), j.depth-1); err != nil {
			return err
		}
	}
	return nil
}
// AnalyzeRepo queues an AnalyzeRepoJob for repo with the given remaining
// depth, unless the repository ("owner/name") has already been processed. It
// always returns nil.
func (a *MalwareIntel) AnalyzeRepo(repo *github.Repository, depth int) error {
	key := repo.GetOwner().GetLogin() + "/" + repo.GetName()
	if _, seen := a.foundRepos[key]; seen {
		return nil
	}
	job := &AnalyzeRepoJob{app: a, repo: repo, depth: depth}
	a.SubmitJob(job)
	return nil
}
// AnalyzeRepoJob inspects one repository and, when it matches the word list,
// records it as a finding and queues its fork parent and its owner.
type AnalyzeRepoJob struct {
	app   *MalwareIntel      // crawler state shared by all jobs
	repo  *github.Repository // the repository to inspect
	depth int                // remaining recursion budget
}

// Run inspects the repository unless it was already processed.
//
// A repository is flagged when its owner login, name or description matches
// the word list. With no recursion budget left, a flagged repository is
// recorded with Skipped set and nothing else happens. Otherwise the
// repository is marked as processed and, if flagged, it is logged and
// recorded, the parent of a fork is queued, and the owner is queued.
//
// Errors are returned to the caller instead of terminating the process, so
// the job loop decides how to stop and deferred cleanup still runs.
func (j *AnalyzeRepoJob) Run() error {
	login := j.repo.GetOwner().GetLogin()
	name := j.repo.GetName()
	id := login + "/" + name

	// Make sure we haven't already looked at this repository.
	if _, ok := j.app.foundRepos[id]; ok {
		return nil
	}

	description := j.repo.GetDescription()
	// TODO(joshua): Get repo README.md to scan it.
	flagged := j.app.IsFlagged(login) || j.app.IsFlagged(name) || j.app.IsFlagged(description)

	if j.depth <= 0 {
		if flagged {
			log.Printf("Skipping %s due to recursion depth", j.repo.GetFullName())
			j.app.flaggedRepos = append(j.app.flaggedRepos, FlaggedRepo{
				Owner:       login,
				Name:        name,
				Description: description,
				Skipped:     true,
			})
		}
		return nil
	}

	j.app.foundRepos[id] = true
	if !flagged {
		return nil
	}

	details := fmt.Sprintf("[FLAGGED] %s/%s: %s", login, name, description)

	// For a flagged fork, fetch the full details to learn the parent and
	// queue it as well.
	if j.repo.GetFork() {
		fullDetails, err := j.app.GetRepository(login, name)
		switch {
		case errors.Is(err, ErrBlacklist):
			// This repo is blacklisted; skip the parent lookup.
		case err != nil:
			return err
		default:
			// The parent can be absent from the response; do not queue a
			// nil repository in that case.
			if parent := fullDetails.GetParent(); parent != nil {
				if err := j.app.AnalyzeRepo(parent, j.depth-1); err != nil {
					return err
				}
				details = fmt.Sprintf("[FORK %s/%s] ", parent.GetOwner().GetLogin(), parent.GetName()) + details
			}
		}
	}

	log.Println("[REPO]", details)
	j.app.flaggedRepos = append(j.app.flaggedRepos, FlaggedRepo{
		Owner:       login,
		Name:        name,
		Description: description,
		Skipped:     false,
	})

	// Search through the owner. (Forks, stargazers and followers of the
	// repository are not explored.)
	if owner := j.repo.GetOwner(); owner != nil {
		if err := j.app.AnalyzeUser(owner, j.depth-1); err != nil {
			return err
		}
	}
	return nil
}
// readNonEmptyLines reads the file called name and returns its lines with
// surrounding whitespace (including a trailing "\r") removed. Blank lines are
// dropped, so a trailing newline does not produce an empty entry.
func readNonEmptyLines(name string) ([]string, error) {
	data, err := os.ReadFile(name)
	if err != nil {
		return nil, err
	}
	var lines []string
	for _, line := range strings.Split(string(data), "\n") {
		if line = strings.TrimSpace(line); line != "" {
			lines = append(lines, line)
		}
	}
	return lines, nil
}

// runSpider performs the whole crawl and returns the first error encountered.
//
// Inputs, read from the working directory:
//   - token.txt: the GitHub access token.
//   - blacklist.txt: lines of the form "user <login>" or "repo <owner/name>".
//   - users.txt: one seed login per line.
//   - flagged.txt: one flagged word per line.
//
// Outputs: output.log receives the log stream and output.csv receives one row
// per finding (owner, name, description, skipped).
func runSpider() error {
	out, err := os.Create("output.log")
	if err != nil {
		return err
	}
	defer out.Close()
	log.SetOutput(out)

	token, err := os.ReadFile("token.txt")
	if err != nil {
		return err
	}
	ctx := context.Background()
	// Trim the token: a trailing newline from the file would end up inside
	// the Authorization header and make every request fail.
	ts := oauth2.StaticTokenSource(
		&oauth2.Token{AccessToken: strings.TrimSpace(string(token))},
	)
	tc := oauth2.NewClient(ctx, ts)
	client := github.NewClient(tc)

	app := &MalwareIntel{
		client:         client,
		foundUsers:     make(map[string]bool),
		foundRepos:     make(map[string]bool),
		blacklistUsers: make(map[string]bool),
		blacklistRepos: make(map[string]bool),
	}

	blacklist, err := readNonEmptyLines("blacklist.txt")
	if err != nil {
		return err
	}
	app.startUsers, err = readNonEmptyLines("users.txt")
	if err != nil {
		return err
	}
	words, err := readNonEmptyLines("flagged.txt")
	if err != nil {
		return err
	}
	// IsFlagged compares against lower-cased input, so store the words
	// lower-cased as well.
	for i, word := range words {
		words[i] = strings.ToLower(word)
	}
	app.wordList = words

	for _, item := range blacklist {
		fields := strings.Fields(item)
		if len(fields) < 2 {
			// Malformed line (kind without a value): ignore it rather
			// than index out of range.
			continue
		}
		switch fields[0] {
		case "user":
			app.blacklistUsers[fields[1]] = true
		case "repo":
			app.blacklistRepos[fields[1]] = true
		}
	}

	// Enumerate through each seed user.
	for _, login := range app.GetUserList() {
		// Force the user to be enumerated.
		delete(app.foundUsers, login)
		// Get all user details.
		user, err := app.GetUser(login)
		if err != nil {
			return err
		}
		if err := app.AnalyzeUser(user, *maxDepth); err != nil {
			return err
		}
	}

	prog := progressbar.Default(-1, "Executing Jobs")
	for len(app.jobs) > 0 {
		// Pop from the front by re-slicing; shifting the whole queue on
		// every pop would make the loop quadratic.
		job := app.jobs[0]
		app.jobs[0] = nil // release the finished job for the GC
		app.jobs = app.jobs[1:]
		if err := job.Run(); err != nil {
			return err
		}
		prog.Add(1)
		prog.Describe(fmt.Sprintf("Executing Jobs: currently waiting: %d", len(app.jobs)))
	}

	outputCsv, err := os.Create("output.csv")
	if err != nil {
		return err
	}
	csvWriter := csv.NewWriter(outputCsv)
	for _, repo := range app.flaggedRepos {
		record := []string{repo.Owner, repo.Name, repo.Description, fmt.Sprintf("%t", repo.Skipped)}
		if err := csvWriter.Write(record); err != nil {
			outputCsv.Close()
			return err
		}
	}
	// csv.Writer buffers its output; without Flush the rows may never reach
	// the file.
	csvWriter.Flush()
	if err := csvWriter.Error(); err != nil {
		outputCsv.Close()
		return err
	}
	return outputCsv.Close()
}

// main parses the command-line flags and runs the crawl. On failure it prints
// the error to standard error and exits with status 1.
func main() {
	flag.Parse()
	if err := runSpider(); err != nil {
		fmt.Fprintf(os.Stderr, "fatal: %v\n", err)
		os.Exit(1)
	}
}
Copyright 2023 Joshua D. Scarsbrook
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment