Skip to content

Instantly share code, notes, and snippets.

@jhejderup
Last active August 7, 2019 08:52
Show Gist options
  • Save jhejderup/3067378d4428c793f96788f5e9599911 to your computer and use it in GitHub Desktop.
Save jhejderup/3067378d4428c793f96788f5e9599911 to your computer and use it in GitHub Desktop.
Data cleaning
# Build every project whose tests pass, then run the call miner on each build.
# All list files live in the parent of repos/ and are addressed as ../<file>,
# so the whole sequence can be run from this one directory.
cd /data/uppdatera/github/repos || exit 1
## Install and do call-mining
# The job string uses only POSIX sh syntax (no `&>` or `[[ ]]`), so it behaves
# the same whichever shell parallel spawns. A project is echoed only when both
# the cd and the Maven build succeed.
time parallel 'cd {} && mvn install -Dmaven.test.skip=true > /dev/null 2>&1 && echo {}' \
  < ../tests-working.txt > ../build-jars.txt
## Run AST Miner
time parallel 'cd {} && java -Xss256m -Xms2048m -Xmx4096m -jar /data/uppdatera/scripts/callminer/target/callminer.jar /data/uppdatera/github/repos/{}/pom.xml > /dev/null 2>&1 && echo {}' \
  < ../build-jars.txt > ../ast-mining.txt
# Drop stray lines that mention the local Maven repository (fixed-string match).
grep -vF '/home/jhejderup/.m2' ../ast-mining.txt > ../ast_mining2.txt
## Get failed projects and run with offline-mode
# comm -13 keeps the lines found only in the second input: projects that were
# built but are missing from the mining result.
comm -13 <(sort ../ast_mining2.txt) <(sort ../build-jars.txt) > ../missing-call-asts.txt
##Rerun AST Miner
# Output is left visible here so failures can be inspected.
# NOTE(review): unlike the first run, this rerun does not cd into the project
# directory first - confirm callminer does not depend on the working directory.
parallel 'java -Xss256m -Xms2048m -Xmx4096m -jar /data/uppdatera/scripts/callminer/target/callminer.jar /data/uppdatera/github/repos/{}/pom.xml' \
  < ../missing-call-asts.txt
#!/bin/bash
# Print one shell command per (call-list file, function signature) pair.
# Each printed command echoes the call-list path when that file contains the
# signature as a whole word. The output is a command list, one command per
# line, suitable for a runner such as GNU parallel.
#
# Inputs (relative to the current directory):
#   functions-to-inject.txt  one function signature per line
#   repos/                   searched for call-asts-per-repo.txt files
mapfile -t signatures < functions-to-inject.txt
mapfile -t call_lists < <(find repos/ -name call-asts-per-repo.txt)

for call_list in "${call_lists[@]}"; do
  for signature in "${signatures[@]}"; do
    # An empty fixed-string pattern matches every line; skip blank input lines.
    [[ -n "$signature" ]] || continue
    # -F: signatures contain regex metacharacters ("." "[" "]" "?" ...), so
    # they must be matched literally. %q shell-quotes both the signature and
    # the path so the generated line survives being re-parsed by a shell.
    printf 'grep -Fwq -- %q %q && echo %q\n' \
      "$signature" "$call_list" "$call_list"
  done
done
# Pre-build dataset (Maven w/ quality assurance badge)
## Unique Repositories: distinct first two path components of each pom folder
awk -F/ '{print $1"/"$2}' pom-folders.txt | sort -u | wc -l
## Projects: one line per pom folder
wc -l < pom-folders.txt
# Post-build dataset (Maven Test/Install)
## Unique Repositories
awk -F/ '{print $1"/"$2}' build-jars.txt | sort -u | wc -l
## Projects
wc -l < build-jars.txt
# AST Mining of resolved function calls (ast_mining2.txt unreliable)
## Unique Repositories: paths start with "repos/", hence three components
find repos/ -name ast-calls-resolved.txt \
  | awk -F/ '{print $1"/"$2"/"$3}' \
  | sort -u \
  | wc -l
## Projects: one ast-calls-resolved.txt per mined project
find repos/ -name ast-calls-resolved.txt | wc -l
# Create evaluation dataset: the distinct <dir>/<dir> prefix of every
# ast-calls-resolved.txt found below the current directory.
# NOTE(review): the "." and "../" paths suggest this step is run from inside
# repos/ - confirm.
find . -name ast-calls-resolved.txt | awk -F/ '{print $2"/"$3}' | sort -u > ../evaluation-dataset.txt
# Generate a complementary list of projects (the directory that holds each
# ast-calls-resolved.txt).
# NOTE(review): this and the following steps address repos/ by name, so they
# are presumably run from the parent of repos/ - confirm.
find repos/ -name ast-calls-resolved.txt -printf '%h\n' > evaluation_mvn_projects.txt
# Shuffle the list
perl -MList::Util=shuffle -e 'print shuffle(<STDIN>);' < evaluation_mvn_projects.txt > eval-pom-folder.txt
## Build dependency tree for each project
# `cd {} &&` keeps Maven from running in the wrong directory when a listed
# folder cannot be entered.
# NOTE(review): -DoutputType is a documented parameter of dependency:tree;
# confirm whether dependency:list honours it or whether dependency:tree was
# intended.
time parallel 'cd {} && mvn dependency:list -DoutputFile=uppdatera-deptree.txt -DoutputType=text' \
  < eval-pom-folder.txt
#0. Create a single file with all function calls (each project contributes
#   its calls de-duplicated)
find . -name ast-calls-resolved.txt | parallel 'sort -u {}' > ../all-ast-calls-raw.txt
## (Total of 4,253,407 calls from wc -l all-ast-calls-raw.txt (non-uniq))
## Uniq function calls 927,188 all-ast-calls-raw.txt
#1. Remove Java-based function calls (e.g., beginning with java)
grep -v ^java all-ast-calls-raw.txt > all-ast-calls.txt
## wc -l --> 674,459 all-ast-calls.txt
#2. Aggregate calls per project: count identical lines, most frequent first
sort all-ast-calls.txt | uniq -c | sort -bnr > aggregate_calls_per_project.txt
##################
################## Repo-based
##################
# NOTE(review): steps 1-2 use "." and "../" (apparently run inside repos/),
# while steps 3-4 read all-ast-calls-repo.txt from the current directory
# (apparently the parent of repos/) - confirm the working directory per step.

# Prefixes of test/mocking frameworks excluded from both aggregations below.
test_framework_prefixes='^(org\.junit|junit|org\.mockito|org\.easymock|org\.jmock)'

#1. Uniq calls per repository (no-java calls)
# `cd {} &&` prevents a failed cd from collecting calls from the wrong
# directory tree. \{\} keeps find's placeholder away from parallel's own {}.
parallel 'cd {} && find . -name ast-calls-resolved.txt -exec cat \{\} \; | grep -v "^java" > call-asts-per-repo.txt' \
  < ../evaluation-dataset.txt
find . -name call-asts-per-repo.txt | parallel 'sort -u {} > {}.uniq'
#2. Concatenate into a single file
find . -name call-asts-per-repo.txt.uniq -exec cat {} + > ../all-ast-calls-repo.txt
## 570,507 all-ast-calls-repo.txt
#3 Group by calls (remove junit and mock)
grep -Ev "$test_framework_prefixes" all-ast-calls-repo.txt \
  | sort | uniq -c | sort -r -k1 -n > agg-calls-per-repo.txt
#4. Group by class (remove junit and mock)
# The sed expression strips the trailing ".method(args)" so only the class
# remains. The same exclusion list as step 3 is applied, so org.junit is
# filtered here as well.
sed 's/\.[a-zA-Z0-9]\+(.*)//g' all-ast-calls-repo.txt \
  | grep -Ev "$test_framework_prefixes" \
  | sort | uniq -c | sort -r -k1 -n > agg-classes-per-repo.txt
###############
############### Call Coverage - 1210 repositories
###############
# Generate the list of commands, then let parallel execute it line by line.
./call_reach_analysis.sh > cmdlist.txt
parallel < cmdlist.txt > dep-fn-coverage.txt
# Coverage: number of distinct lines produced by the commands
sort -u dep-fn-coverage.txt | wc -l
# Prune the checkout: drop repos without a pom.xml, without a README, or
# whose README shows none of the recognised quality-assurance badges/links.
# The rm -rf calls below act on relative paths, so abort if the cd fails.
cd /data/uppdatera/github/repos || exit 1
# Create list of repos to remove (non-maven repos)
# The job strings use POSIX `[ ]` so they work in whichever shell parallel
# spawns; `cd {} || exit 1` keeps an unreadable entry off the removal list.
ls -d */* | parallel 'cd {} || exit 1; lines=$(find . -type f -name pom.xml | wc -l); [ "$lines" -eq 0 ] && echo {}' > ../removed-non-pom-repos.txt
# Remove the repos
parallel 'rm -rf -- {}' < ../removed-non-pom-repos.txt
# List repos that have no README file at all, then remove them
ls -d */* | parallel 'cd {} || exit 1; lines=$(find . -type f -iname "README*" | wc -l); [ "$lines" -eq 0 ] && echo {}' > ../remove-non-readme.txt
parallel 'rm -rf -- {}' < ../remove-non-readme.txt
# Find repos whose README files contain no quality-assurance badge/link
# (codecov, coveralls, codeclimate, or a travis-ci "Build Status" badge).
# grep -Ec counts matching lines; [!] matches a literal "!".
ls -d */* | parallel 'cd {} || exit 1; lines=$(find . -type f -iname "README*" -exec cat \{\} \; | grep -Ec "https://codecov\.io/gh/|https://coveralls\.io/|https://codeclimate\.com/github/|[!]\[Build Status\]\(https://travis-ci\.org"); [ "$lines" -eq 0 ] && echo {}' > ../remove-no-quality-assurance.txt
parallel 'rm -rf -- {}' < ../remove-no-quality-assurance.txt
#Remove empty folders
# -maxdepth is a global option and must precede tests and actions.
find . -maxdepth 1 -type d -empty -delete
org.apache.commons.lang3.StringUtils
com.google.common.collect.Lists
com.google.common.base.Preconditions
org.apache.commons.io.FileUtils
org.apache.commons.io.IOUtils
com.google.gson.Gson
com.google.common.collect.ImmutableMap
com.google.common.collect.Maps
com.google.common.collect.Sets
com.google.common.collect.ImmutableList
com.google.common.base.Strings
com.fasterxml.jackson.databind.ObjectMapper
com.google.gson.GsonBuilder
com.google.common.collect.ImmutableSet
org.apache.commons.lang.StringUtils
com.google.common.collect.Iterables
com.google.common.base.Joiner
org.apache.http.impl.client.HttpClientBuilder
org.apache.http.impl.client.HttpClientBuilder
org.apache.commons.codec.binary.Base64
com.google.common.io.Files
com.google.gson.JsonObject
com.fasterxml.jackson.databind.JsonNode
org.apache.commons.cli.Options
org.apache.commons.cli.CommandLine
org.apache.commons.lang3.ArrayUtils
org.joda.time.DateTime
com.alibaba.fastjson.JSON
org.jsoup.nodes.Element
org.json.JSONObject
com.google.common.collect.Multimap
com.google.common.base.Objects
org.apache.commons.lang3.ArrayUtils
org.jsoup.Jsoup
org.json.JSONArray
org.joda.time.format.DateTimeFormatter
com.google.gson.JsonParser
org.yaml.snakeyaml.Yaml
com.fasterxml.jackson.core.JsonParser
org.apache.commons.lang3.builder.ToStringBuilder
com.google.common.io.BaseEncoding
org.joda.time.base.AbstractDateTime
org.antlr.v4.runtime.Recognizer
org.antlr.v4.runtime.Parser
org.w3c.dom.Node
org.joda.time.DateTimeZone
org.objectweb.asm.MethodVisitor
com.fasterxml.jackson.databind.DeserializationContext
com.thoughtworks.xstream.XStream
com.codahale.metrics.Histogram
net.sf.json.JSONObject
org.dom4j.Document
org.w3c.dom.Element
# Collect every folder that contains a pom.xml, shuffle the list, drop
# example projects, and record the folders whose test suite passes.
# All list files live in the parent of repos/ and are addressed as ../<file>.
cd /data/uppdatera/github/repos || exit 1
# Get all folders with pom.xml file (%h prints the directory of each match)
ls -d */* | parallel "find {} -name pom.xml -printf '%h\n'" > ../pom-folders.txt
# Shuffle the list
perl -MList::Util=shuffle -e 'print shuffle(<STDIN>);' < ../pom-folders.txt > ../pom-folder.txt
# Remove examples (any path containing "examples")
grep -v examples ../pom-folder.txt > ../pom-folders.txt
# Run test suites and store working cases
# The job string uses only POSIX sh syntax (no `&>` or `[[ ]]`); a folder is
# echoed only when both the cd and `mvn test` succeed.
time parallel 'cd {} && mvn test -Dmaven.test.skip=false > /dev/null 2>&1 && echo {}' \
  < ../pom-folders.txt > ../tests-working.txt
com.google.common.collect.Lists.newArrayList()
org.apache.commons.lang3.StringUtils.isBlank(java.lang.CharSequence)
org.apache.commons.lang3.StringUtils.isEmpty(java.lang.CharSequence)
com.google.common.collect.Lists.newArrayList(E...)
org.apache.commons.lang3.StringUtils.isNotBlank(java.lang.CharSequence)
com.google.common.base.Strings.isNullOrEmpty(java.lang.String)
com.google.gson.GsonBuilder.create()
com.google.gson.Gson.toJson(java.lang.Object)
com.google.common.collect.Maps.newHashMap()
com.google.gson.Gson.fromJson(java.lang.String, java.lang.Class<T>)
org.apache.http.impl.client.HttpClientBuilder.build()
com.google.common.collect.Lists.newArrayList(java.lang.Iterable<? extends E>)
org.apache.commons.lang3.StringUtils.isNotEmpty(java.lang.CharSequence)
com.google.gson.JsonElement.getAsString()
org.apache.commons.io.IOUtils.copy(java.io.InputStream, java.io.OutputStream)
com.fasterxml.jackson.databind.ObjectMapper.writeValueAsString(java.lang.Object)
com.google.common.base.Objects.equal(java.lang.Object, java.lang.Object)
org.apache.commons.io.IOUtils.closeQuietly(java.io.InputStream)
org.apache.commons.io.FileUtils.deleteDirectory(java.io.File)
org.apache.commons.lang3.StringUtils.join(java.lang.Iterable<?>, java.lang.String)
org.jsoup.nodes.Element.text()
com.google.gson.JsonParser.parse(java.lang.String)
com.google.gson.JsonElement.getAsJsonArray()
org.apache.commons.lang3.StringUtils.replace(java.lang.String, java.lang.String, java.lang.String)
org.apache.commons.io.IOUtils.toString(java.io.InputStream, java.lang.String)
org.apache.commons.io.IOUtils.toString(java.io.InputStream)
org.jsoup.nodes.Element.select(java.lang.String)
org.jsoup.Jsoup.parse(java.lang.String)
com.alibaba.fastjson.JSON.toJSONString(java.lang.Object)
org.joda.time.base.BaseDateTime.getMillis()
org.antlr.v4.runtime.atn.ATNDeserializer.deserialize(char[])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment