Last active
August 7, 2019 08:52
-
-
Save jhejderup/3067378d4428c793f96788f5e9599911 to your computer and use it in GitHub Desktop.
Data cleaning
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build every project whose test suite passed, then mine its resolved calls.
# Requires GNU parallel, Maven and the callminer jar. Every list file lives one
# level above repos/, so all of them are addressed with a ../ prefix.

# Go to folders. Stop if the directory is missing so that the relative paths
# below are never resolved against some other location.
cd /data/uppdatera/github/repos || exit 1

## Install and do call-mining
# A project is recorded only when both the cd and `mvn install` succeed.
time parallel 'cd {} && mvn install -Dmaven.test.skip=true &> /dev/null && echo {}' \
  < ../tests-working.txt > ../build-jars.txt

## Run AST Miner
# A project is recorded only when the miner exits with status 0.
time parallel 'cd {} && java -Xss256m -Xms2048m -Xmx4096m -jar /data/uppdatera/scripts/callminer/target/callminer.jar /data/uppdatera/github/repos/{}/pom.xml &> /dev/null && echo {}' \
  < ../build-jars.txt > ../ast-mining.txt
# Drop entries that point into the local Maven repository.
grep -v "/home/jhejderup/.m2" ../ast-mining.txt > ../ast_mining2.txt

## Get failed projects and run with offline-mode
# comm -13 keeps the lines that appear only in build-jars.txt, i.e. projects
# that built but were not mined successfully.
# NOTE(review): the heading mentions offline mode, but no offline flag is
# passed to the rerun below - confirm whether one is needed.
comm -13 <(sort ../ast_mining2.txt) <(sort ../build-jars.txt) > ../missing-call-asts.txt

## Rerun AST Miner
# Output is left visible this time so the failures can be inspected.
parallel 'java -Xss256m -Xms2048m -Xmx4096m -jar /data/uppdatera/scripts/callminer/target/callminer.jar /data/uppdatera/github/repos/{}/pom.xml' \
  < ../missing-call-asts.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Prints one shell command per (call-list file, function signature) pair.
# Each printed command outputs the call-list path when that file contains the
# signature. The command list is meant to be executed by another tool
# (e.g. piped into GNU parallel).
#
# Inputs (relative to the current directory):
#   functions-to-inject.txt               one function signature per line
#   repos/**/call-asts-per-repo.txt       per-repository call lists
#
# Signatures are matched as fixed strings (grep -F). They contain regex
# metacharacters such as '.', '[' and ']'; as a regex, "char[]" is an invalid
# bracket expression and would never match.
emit_reach_commands() {
  local -a signatures=() call_files=()
  local signature call_file

  if [[ ! -r functions-to-inject.txt ]]; then
    printf 'call_reach_analysis: cannot read functions-to-inject.txt\n' >&2
    return 1
  fi

  readarray -t signatures < functions-to-inject.txt
  readarray -t call_files < <(find repos/ -name call-asts-per-repo.txt)

  for call_file in "${call_files[@]}"; do
    for signature in "${signatures[@]}"; do
      # Skip blank lines: an empty fixed-string pattern matches every line.
      [[ -n "$signature" ]] || continue
      # %q shell-quotes each argument so spaces, parentheses and angle
      # brackets in signatures survive re-parsing by the executing shell.
      printf 'grep -Fwq -- %q %q && echo %q\n' \
        "$signature" "$call_file" "$call_file"
    done
  done
}

emit_reach_commands
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# --- Pre-build dataset (Maven w/ quality assurance badge) ---
## Number of distinct repositories (first two path components of each entry)
awk -F"/" '{print $1"/"$2}' pom-folders.txt | sort -u | wc -l
## Number of projects (one pom folder per line)
wc -l < pom-folders.txt

# --- Post-build dataset (Maven Test/Install) ---
## Number of distinct repositories
awk -F"/" '{print $1"/"$2}' build-jars.txt | sort -u | wc -l
## Number of projects
wc -l < build-jars.txt

# --- AST Mining of resolved function calls (ast_mining2.txt unreliable) ---
## Number of distinct repositories (paths start with repos/, so three components)
find repos/ -name ast-calls-resolved.txt | awk -F"/" '{print $1"/"$2"/"$3}' | sort -u | wc -l
## Number of projects
find repos/ -name ast-calls-resolved.txt | wc -l

# --- Create evaluation dataset ---
# Paths start with "./", so fields 2 and 3 are the two components after it.
find . -name ast-calls-resolved.txt | awk -F"/" '{print $2"/"$3}' | sort -u > ../evaluation-dataset.txt

# Generate a complementary list of projects: the directory of every result file
find repos/ -name ast-calls-resolved.txt -printf "%h\n" > evaluation_mvn_projects.txt

# Randomize the order of that list
perl -MList::Util=shuffle -e 'print shuffle(<STDIN>);' < evaluation_mvn_projects.txt > eval-pom-folder.txt

## Write the dependency list of each project to uppdatera-deptree.txt
time parallel 'cd {}; mvn dependency:list -DoutputFile=uppdatera-deptree.txt -DoutputType=text' < eval-pom-folder.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Aggregates the mined function calls per project and per repository.
# Run from the dataset root, i.e. the directory that contains repos/, the list
# files and call_reach_analysis.sh. Steps that must run inside repos/ are
# wrapped in a subshell so the working directory of the rest is unaffected.

#0. Create a single file with all function calls (each project should have uniq calls)
(cd repos && find . -name ast-calls-resolved.txt | parallel 'sort -u {}') \
  > all-ast-calls-raw.txt
## (Total of 4,253,407 calls from wc -l all-ast-calls-raw.txt (non-uniq))
## Uniq function calls 927,188 all-ast-calls-raw.txt

#1. Remove Java-based function calls (e.g., beginning with java)
grep -v '^java' all-ast-calls-raw.txt > all-ast-calls.txt
## wc -l --> 674,459 all-ast-calls.txt

#2. Aggregate calls per project
sort all-ast-calls.txt | uniq -c | sort -bnr > aggregate_calls_per_project.txt

##################
################## Repo-based
##################
(
  cd repos || exit 1

  #1. Uniq calls per repository (no-java calls)
  # "cd {} &&" keeps a failed cd from writing the output file into repos/.
  parallel 'cd {} && find . -name ast-calls-resolved.txt -exec cat \{\} \; | grep -v ^java > call-asts-per-repo.txt' \
    < ../evaluation-dataset.txt
  find . -name call-asts-per-repo.txt | parallel 'sort -u {} > {}.uniq'

  #2. Concatenate into a single file
  find . -name call-asts-per-repo.txt.uniq -exec cat {} + > ../all-ast-calls-repo.txt
)
## 570,507 all-ast-calls-repo.txt

#3 Group by calls (remove junit and mock)
grep -v -e '^org\.junit' -e '^junit' -e '^org\.mockito' -e '^org\.easymock' -e '^org\.jmock' \
  all-ast-calls-repo.txt \
  | sort | uniq -c | sort -r -k1 -n > agg-calls-per-repo.txt

#4. Group by class (remove junit and mock)
# The sed expression strips the trailing ".method(args)". '_' and '$' are
# included because they are legal in Java method names. The filter list is the
# same as in step 3, so org.junit classes are excluded here as well.
sed 's/\.[a-zA-Z0-9_$]\+(.*)//g' all-ast-calls-repo.txt \
  | grep -v -e '^org\.junit' -e '^junit' -e '^org\.mockito' -e '^org\.easymock' -e '^org\.jmock' \
  | sort | uniq -c | sort -r -k1 -n > agg-classes-per-repo.txt

###############
############### Call Coverage - 1210 repositories
###############
# call_reach_analysis.sh prints one command per line; parallel executes them.
./call_reach_analysis.sh > cmdlist.txt
parallel < cmdlist.txt > dep-fn-coverage.txt
# Coverage: number of distinct call-list files that matched
sort -u dep-fn-coverage.txt | wc -l
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Prunes the cloned repositories down to Maven projects that have a README
# with a quality-assurance badge. WARNING: destructive - every step deletes
# directories relative to the current directory.

#Go to folders
# Abort if the directory is missing; otherwise the rm -rf steps below would
# run against whatever directory the shell happens to be in.
cd /data/uppdatera/github/repos || exit 1

# Create list of repos to remove (non-maven repos)
# A repo is listed when no pom.xml exists anywhere beneath it. "cd {} &&"
# ensures an entry we cannot enter is never listed for deletion.
printf '%s\n' */* \
  | parallel 'cd {} && [[ -z "$(find . -type f -name pom.xml -print -quit)" ]] && echo {}' \
  > ../removed-non-pom-repos.txt
# Remove the repos
parallel 'rm -rf -- {}' < ../removed-non-pom-repos.txt

# Find repos that have no README file at all, then remove them
printf '%s\n' */* \
  | parallel 'cd {} && [[ -z "$(find . -type f -iname "README*" -print -quit)" ]] && echo {}' \
  > ../remove-non-readme.txt
parallel 'rm -rf -- {}' < ../remove-non-readme.txt

#Find non-badge quality assurance repos
# A repo is listed when none of its README files mentions a codecov,
# coveralls, codeclimate or travis-ci badge URL.
printf '%s\n' */* \
  | parallel 'cd {} && ! find . -type f -iname "README*" -exec cat \{\} + | grep -Eq "https://codecov\.io/gh/|https://coveralls\.io/|https://codeclimate\.com/github/|!\[Build Status]\(https://travis-ci\.org" && echo {}' \
  > ../remove-no-quality-assurance.txt
parallel 'rm -rf -- {}' < ../remove-no-quality-assurance.txt

#Remove empty folders
# -mindepth/-maxdepth are global options and must precede the tests.
find . -mindepth 1 -maxdepth 1 -type d -empty -delete
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
org.apache.commons.lang3.StringUtils | |
com.google.common.collect.Lists | |
com.google.common.base.Preconditions | |
org.apache.commons.io.FileUtils | |
org.apache.commons.io.IOUtils | |
com.google.gson.Gson | |
com.google.common.collect.ImmutableMap | |
com.google.common.collect.Maps | |
com.google.common.collect.Sets | |
com.google.common.collect.ImmutableList | |
com.google.common.base.Strings | |
com.fasterxml.jackson.databind.ObjectMapper | |
com.google.gson.GsonBuilder | |
com.google.common.collect.ImmutableSet | |
org.apache.commons.lang.StringUtils | |
com.google.common.collect.Iterables | |
com.google.common.base.Joiner | |
org.apache.http.impl.client.HttpClientBuilder | |
org.apache.http.impl.client.HttpClientBuilder | |
org.apache.commons.codec.binary.Base64 | |
com.google.common.io.Files | |
com.google.gson.JsonObject | |
com.fasterxml.jackson.databind.JsonNode | |
org.apache.commons.cli.Options | |
org.apache.commons.cli.CommandLine | |
org.apache.commons.lang3.ArrayUtils | |
org.joda.time.DateTime | |
com.alibaba.fastjson.JSON | |
org.jsoup.nodes.Element | |
org.json.JSONObject | |
com.google.common.collect.Multimap | |
com.google.common.base.Objects | |
org.apache.commons.lang3.ArrayUtils | |
org.jsoup.Jsoup | |
org.json.JSONArray | |
org.joda.time.format.DateTimeFormatter | |
com.google.gson.JsonParser | |
org.yaml.snakeyaml.Yaml | |
com.fasterxml.jackson.core.JsonParser | |
org.apache.commons.lang3.builder.ToStringBuilder | |
com.google.common.io.BaseEncoding | |
org.joda.time.base.AbstractDateTime | |
org.antlr.v4.runtime.Recognizer | |
org.antlr.v4.runtime.Parser | |
org.w3c.dom.Node | |
org.joda.time.DateTimeZone | |
org.objectweb.asm.MethodVisitor | |
com.fasterxml.jackson.databind.DeserializationContext | |
com.thoughtworks.xstream.XStream | |
com.codahale.metrics.Histogram | |
net.sf.json.JSONObject | |
org.dom4j.Document | |
org.w3c.dom.Element |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Collects every folder that contains a pom.xml, shuffles the list, drops
# example projects and records the projects whose test suite passes.
# All list files live one level above repos/, so each step uses the ../ prefix;
# this makes the test run consume the shuffled, filtered list rather than the
# raw one.

#Go to folders
# Stop if the directory is missing so relative paths are not misresolved.
cd /data/uppdatera/github/repos || exit 1

# Get all folders with pom.xml file
printf '%s\n' */* | parallel "find {} -name pom.xml -printf '%h\n'" > ../pom-folders.txt

# Shuffle the list
perl -MList::Util=shuffle -e 'print shuffle(<STDIN>);' < ../pom-folders.txt > ../pom-folder.txt

# Remove examples
grep -v examples ../pom-folder.txt > ../pom-folders.txt

# Run test suites and store working cases
# A project is recorded only when both the cd and `mvn test` succeed.
time parallel 'cd {} && mvn test -Dmaven.test.skip=false &> /dev/null && echo {}' \
  < ../pom-folders.txt > ../tests-working.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
com.google.common.collect.Lists.newArrayList() | |
org.apache.commons.lang3.StringUtils.isBlank(java.lang.CharSequence) | |
org.apache.commons.lang3.StringUtils.isEmpty(java.lang.CharSequence) | |
com.google.common.collect.Lists.newArrayList(E...) | |
org.apache.commons.lang3.StringUtils.isNotBlank(java.lang.CharSequence) | |
com.google.common.base.Strings.isNullOrEmpty(java.lang.String) | |
com.google.gson.GsonBuilder.create() | |
com.google.gson.Gson.toJson(java.lang.Object) | |
com.google.common.collect.Maps.newHashMap() | |
com.google.gson.Gson.fromJson(java.lang.String, java.lang.Class<T>) | |
org.apache.http.impl.client.HttpClientBuilder.build() | |
com.google.common.collect.Lists.newArrayList(java.lang.Iterable<? extends E>) | |
org.apache.commons.lang3.StringUtils.isNotEmpty(java.lang.CharSequence) | |
com.google.gson.JsonElement.getAsString() | |
org.apache.commons.io.IOUtils.copy(java.io.InputStream, java.io.OutputStream) | |
com.fasterxml.jackson.databind.ObjectMapper.writeValueAsString(java.lang.Object) | |
com.google.common.base.Objects.equal(java.lang.Object, java.lang.Object) | |
org.apache.commons.io.IOUtils.closeQuietly(java.io.InputStream) | |
org.apache.commons.io.FileUtils.deleteDirectory(java.io.File) | |
org.apache.commons.lang3.StringUtils.join(java.lang.Iterable<?>, java.lang.String) | |
org.jsoup.nodes.Element.text() | |
com.google.gson.JsonParser.parse(java.lang.String) | |
com.google.gson.JsonElement.getAsJsonArray() | |
org.apache.commons.lang3.StringUtils.replace(java.lang.String, java.lang.String, java.lang.String) | |
org.apache.commons.io.IOUtils.toString(java.io.InputStream, java.lang.String) | |
org.apache.commons.io.IOUtils.toString(java.io.InputStream) | |
org.jsoup.nodes.Element.select(java.lang.String) | |
org.jsoup.Jsoup.parse(java.lang.String) | |
com.alibaba.fastjson.JSON.toJSONString(java.lang.Object) | |
org.joda.time.base.BaseDateTime.getMillis() | |
org.antlr.v4.runtime.atn.ATNDeserializer.deserialize(char[]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment