Skip to content

Instantly share code, notes, and snippets.

@smothiki
Last active June 27, 2023 16:02
Show Gist options
  • Save smothiki/ef27af8bca9af19fcdeb12ebfcc2073c to your computer and use it in GitHub Desktop.
Save smothiki/ef27af8bca9af19fcdeb12ebfcc2073c to your computer and use it in GitHub Desktop.
cassandra tables

Files Tables

file ID text (Primary key)
Size Bigint ( Bytes) Max is 1000000000
Sha256, Sha1, md5  text (File process worker updates during file creation)
Names set <text> 
first_submission_Date int (inserted during file creation)
last analysis date int (updated for each rescan)
AnalysisID map <int, text> ( int is the date) Get the earliest analysis ID 
file-extension text
file-Type-description text ( file details) 
Tags set <text> ( file header info like installer type along with other metadata tags)

AnalysisID can be a map: since no parallel updates happen, overwrites are avoided.

Analysis table

AnalysisID, AnalyzerName text (Primary key)
fileID text
type text ( Antivirus or metadata script name)
AnalysismetaData Blob ( JSON for av engine or metadata )

Job_status table

Status text (Primary key) 
timestamp int 
Analysis ID text (Primary key)
Analyzer list text 
( In queue when adding the event to the queue) ( In progress when the job manager sends the events to the job queue)
( Completed once the reconciler sees all the AV results)

File_chunk_DB

sha text chunkID

A config Table

analyzer list text file size limit text 100KB-1GB

Scan table

fileID (primary key) Status inqueue time int ( Remove entry when scan is done)

{
    "doc": {
        "mappings": {
            properties:{
            "capabilities_tags":  {
            "type": "nested"
             "fields": {..}
            }
            "creation_date": {
            type : integer
            }
            "crowdsourced_ids_results": 
            type: "nested",
               :tags fields:
                        {
                            "dest_ip": "<string>",
                            "dest_port": <int>,
                            "hostname": "<string>",
                            "protocol": "<string>",
                            "src_ip": "<string>",
                            "src_port": <int>,
                            "url": "<string>"
                        }
                    ],
                    "alert_severity": "<string>",
                    "rule_category": "<string>",
                    "rule_id": "<string>",
                    "rule_msg": "<string>",
                    "rule_source": "<string>"
                }
            ],
            "crowdsourced_ids_stats": {
                "info": <int>,
                "high": <int>,
                "low": <int>,
                "medium": <int>
            },
            "crowdsourced_yara_results": [
                {
                    "description": "<string>",
                    "match_in_subfile": <boolean>,
                    "rule_name": "<string>",
                    "ruleset_id": "<string>",
                    "ruleset_name": "<string>",
                    "source": "<string>"
                }
            ],
            "downloadable": <bool>,
            "first_submission_date": <int:timestamp>,
            "last_analysis_date": <int:timestamp>,
            "last_analysis_results": {
                "<string:engine_name>": {
                    "category": "<string>",
                    "engine_name": "<string>",
                    "engine_update": "<string>",
                    "engine_version": "<string>",
                    "method": "<string>",
                    "result": "<string>"
                }
            },
            "last_analysis_stats": {
                "confirmed-timeout": <int>,
                "failure": <int>,
                "harmless": <int>,
                "malicious": <int>,
                "suspicious": <int>,
                "timeout": <int>,
                "type-unsupported": <int>,
                "undetected": <int>
            },
            "last_modification_date": <int:timestamp>,
            "last_submission_date": <int:timestamp>,
            "md5": "<string>",
            "meaningful_name": "<string>",
            "names": [
                "<strings>",...
            ],
            "reputation": <int>,
            "sandbox_verdicts": {
                "<string:sandbox_name>": {
                    "category": "<string>",
                    "confidence": <int>,
                    "malware_classification": [
                        "<string>"
                    ],
                    "malware_names": [
                        "<string>"
                    ],
                    "sandbox_name": "<string>"
                }
            },
            "sha1": "<string>",
            "sha256": "<string>",
            "sigma_analysis_results": [{
              "rule_title": "<string>",
              "rule_source": "<string>",
              "match_context": [{
                "values": {
                  "<string>": "<string>"}}],
              "rule_level": "<string>",
              "rule_description": "<string>",
              "rule_author": "<string>",
              "rule_id": "<string>"
            }],
            "sigma_analysis_stats": {
                "critical": <int>,
                "high": <int>,
                "low": <int>,
                "medium": <int>
            },
            "sigma_analysis_summary": {
                "<string:ruleset_name>": {
                    "critical": <int>,
                    "high": <int>,
                    "low": <int>,
                    "medium": <int>
                }
            },
            "size": <int>,
            "tags": [
                "<strings>",...
            ],
            "times_submitted": <int>,
            "type_description": "<string>",
            "type_extension": "<string>",
            "type_tag": "<string>",
            "unique_sources": <int>,
            "vhash": "<string>"
        },
        "id": "<SHA256>",
        "type": "file"
    }
}
```
Submitting a file for rescan?
Wait for current analysis to finish. Then proceed with rescan
What is a timeout?
Datawarehousing?
```
FileProcess topic
FileID text
analysisID text
FileTags []string that are relevant to the jobs and AV engines and metadata scripts
Groups
Each AV or metadata script manager is a group
Metadata
FileID, analysis ID, analyzer name , metadata json result

job status update queries

select analysisID, analyzernames  from job_status where status="in-queue" LIMIT 200
select analyzername, analysisID from analysis where analysisID IN (list of analysis IDs from the above query)
if all analyzernames are present for an analysis ID from the above results
{
  update status to complete for analysis ID 
}

---
swagger: '2.0'
info:
title: virustotalClone
version: 0.1.0
description: ' A malware analysis tool'
basePath: /api/v1
schemes:
- http
- https
consumes:
- application/json
produces:
- application/json
securityDefinitions:
Bearer:
type: apiKey
name: Authorization
in: header
security:
- Bearer: []
paths:
/XLfile:
get:
summary: "Get a large file upload link"
operationId: "GetLargeFileUploadLink"
responses:
200:
description: "List models response"
schema:
link:
type: string
400:
description: unexpected error
schema:
$ref: '#/definitions/Error'
/files:
post:
description: Upload a file
operationId: UploadFile
consumes:
- multipart/form-data
parameters:
content:
multipart/form-data:
schema:
"$ref": "#/definitions/malwarefile"
required: true
responses:
'200':
description: Return an analysis result
content:
application/json:
schema:
"$ref": "#/definitions/analysisID"
'400':
description: Validation Error
content:
application/json:
schema:
"$ref": "#/definitions/Error"
get:
summary: "Search entire file corpus for a given query string"
operationId: "SearchFiles"
parameters:
- name: page_size
in: query
description: "Page size is an optional argument for number of entries to return in one page."
required: false
type: integer
format: int32
- name: page_token
in: query
description: "Page token is an optional argument for specifying which page of results to get."
required: false
type: string
- name: search_filter
in: query
description: "Search filter is an optional HTTP parameter to filter results by."
required: True
type: string
responses:
200:
description: "List models response"
schema:
$ref: '#/definitions/FileInformationList'
400:
description: unexpected error
schema:
$ref: '#/definitions/Error'
/files/{id}:
get:
summary: "Get a file's scan information"
operationId: GetModel
responses:
200:
description: "A successful model response"
schema:
$ref: '#/definitions/FileInformation'
default:
description: unexpected error
schema:
$ref: '#/definitions/Error'
parameters:
- name: "id"
description: "SHA256 sum of a file"
required: true
in: path
type: string
put:
tags:
summary: "Rescan a file and get its scan information"
operationId: ReScanFile
parameters:
- name: id
in: path
description: "sha256 sum of file"
required: true
responses:
200:
description: "A successful model response"
schema:
$ref: '#/definitions/FileInformation'
default:
description: unexpected error
schema:
$ref: '#/definitions/Error'
/analysis/{id}:
get:
summary: "Get analysis information based on analysis ID"
operationId: GetAnalysisByID
responses:
200:
description: "A successful model response"
schema:
$ref: '#/definitions/Analysis'
default:
description: unexpected error
schema:
$ref: '#/definitions/Error'
parameters:
- name: "id"
description: "Analysis ID string"
required: true
in: path
type: string
definitions:
Tag:
type: object
properties:
key:
type: string
description: "The tag key."
value:
type: string
description: "the tag value"
description: "Tag is used to add more metadata."
malwarefile:
title: malwarefile
required:
- file
type: object
properties:
file:
title: File
type: string
format: binary
analysisID:
properties:
title: File
type: string
Error:
title: Error
type: object
properties:
msg:
title: Message
type: string
FileInformationList:
type: object
properties:
files:
type: array
$ref: '#/definitions/FileInformation'
FileInformation:
type: object
title: Information of a file
properties:
names:
type: array
items:
type: string
hash:
"$ref": "#/definitions/Hash"
tags:
type: "array"
items:
$ref: "#/definitions/Tag"
last_submission_date:
type: integer
format: int64
size:
title: Size in Bytes
type: integer
format: int64
extension:
title: Extension
type: string
description:
title: description
type: string
unique_sources:
type: array
items:
type: string
type_tags:
type: array
items:
type: string
last_analysis_result:
$ref: "#/definitions/Analysis"
# sandbox_verdicts:? same as analysis result redundant?
metadata_script_results:
type: "array"
items:
$ref: "#/definitions/GenericResult"
Hash:
title: Hash
required:
- md5
- sha1
- sha256
type: object
properties:
md5:
title: Md5
type: string
sha1:
title: Sha1
type: string
sha256:
title: Sha256
type: string
description: |-
used to identify file uniquely based on sha checksum
Analysis:
title: Analysis of a file
type: object
properties:
fileID:
description: sha256 sum of file the analysis belongs to
type: string
analysisID:
type: string
date:
type: integer
format: int64
type:
type: string
description: type of the analyzed object; the value is always file
results:
type: array
$ref: "#/definitions/AnalysisResult"
stats:
type: object
$ref: "#/definitions/AnalysisStats"
AnalysisResult:
title: Analysis result of each AV
type: object
properties:
category:
type: string
enum: [ "timeout", "failure","harmless","suspicious","malicious","not-supported" ]
engine_name:
type: string
engine_update:
type: string
engine_version:
type: string
method:
type: string
result:
type: string
AnalysisStats:
type: object
properties:
confirmed-timeout:
type: number
failure:
type: number
malicious:
type: number
suspicious:
type: number
timeout:
type: number
undetected:
type: number
not-supported:
type: number
GenericResult:
type: object
properties:
Name:
type: string
Tags:
type: array
items:
$ref: "#/definitions/Tag"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment