Last active
September 26, 2018 08:16
-
-
Save omalley/5125691 to your computer and use it in GitHub Desktop.
Auto-discovered Apache Hive schema for githubarchive.org's JSON logs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| create table github ( | |
| actor: string, | |
| actor_attributes: struct < | |
| blog: string, | |
| company: string, | |
| email: string, | |
| gravatar_id: binary, | |
| location: string, | |
| login: string, | |
| name: string, | |
| type: string>, | |
| created_at: timestamp, | |
| payload: struct < | |
| action: string, | |
| comment: struct < | |
| _links: struct < | |
| html: struct < | |
| href: string>, | |
| pull_request: struct < | |
| href: string>, | |
| self: struct < | |
| href: string>>, | |
| body: string, | |
| commit_id: binary, | |
| created_at: timestamp, | |
| id: int, | |
| original_commit_id: binary, | |
| original_position: tinyint, | |
| path: string, | |
| position: tinyint, | |
| updated_at: timestamp, | |
| url: string, | |
| user: struct < | |
| avatar_url: string, | |
| events_url: string, | |
| followers_url: string, | |
| following_url: string, | |
| gists_url: string, | |
| gravatar_id: binary, | |
| id: smallint, | |
| login: string, | |
| organizations_url: string, | |
| received_events_url: string, | |
| repos_url: string, | |
| starred_url: string, | |
| subscriptions_url: string, | |
| type: string, | |
| url: string>>, | |
| comment_id: int, | |
| commit: binary, | |
| desc: string, | |
| description: string, | |
| head: binary, | |
| id: int, | |
| issue: int, | |
| issue_id: int, | |
| master_branch: string, | |
| member: struct < | |
| avatar_url: string, | |
| events_url: string, | |
| followers_url: string, | |
| following_url: string, | |
| gists_url: string, | |
| gravatar_id: binary, | |
| id: int, | |
| login: string, | |
| organizations_url: string, | |
| received_events_url: string, | |
| repos_url: string, | |
| starred_url: string, | |
| subscriptions_url: string, | |
| type: string, | |
| url: string>, | |
| name: string, | |
| number: smallint, | |
| pages: array <struct < | |
| action: string, | |
| html_url: string, | |
| page_name: string, | |
| sha: binary, | |
| summary: string, | |
| title: string>>, | |
| pull_request: struct < | |
| _links: struct < | |
| comments: struct < | |
| href: string>, | |
| html: struct < | |
| href: string>, | |
| issue: struct < | |
| href: string>, | |
| review_comments: struct < | |
| href: string>, | |
| self: struct < | |
| href: string>>, | |
| additions: tinyint, | |
| assignee: struct < | |
| avatar_url: string, | |
| events_url: string, | |
| followers_url: string, | |
| following_url: string, | |
| gists_url: string, | |
| gravatar_id: binary, | |
| id: int, | |
| login: string, | |
| organizations_url: string, | |
| received_events_url: string, | |
| repos_url: string, | |
| starred_url: string, | |
| subscriptions_url: string, | |
| type: string, | |
| url: string>, | |
| base: struct < | |
| label: string, | |
| ref: string, | |
| repo: struct < | |
| archive_url: string, | |
| assignees_url: string, | |
| blobs_url: string, | |
| branches_url: string, | |
| clone_url: string, | |
| collaborators_url: string, | |
| comments_url: string, | |
| commits_url: string, | |
| compare_url: string, | |
| contents_url: string, | |
| contributors_url: string, | |
| created_at: timestamp, | |
| default_branch: string, | |
| description: string, | |
| downloads_url: string, | |
| events_url: string, | |
| fork: boolean, | |
| forks: tinyint, | |
| forks_count: tinyint, | |
| forks_url: string, | |
| full_name: string, | |
| git_commits_url: string, | |
| git_refs_url: string, | |
| git_tags_url: string, | |
| git_url: string, | |
| has_downloads: boolean, | |
| has_issues: boolean, | |
| has_wiki: boolean, | |
| homepage: string, | |
| hooks_url: string, | |
| html_url: string, | |
| id: int, | |
| issue_comment_url: string, | |
| issue_events_url: string, | |
| issues_url: string, | |
| keys_url: string, | |
| labels_url: string, | |
| language: string, | |
| languages_url: string, | |
| master_branch: string, | |
| merges_url: string, | |
| milestones_url: string, | |
| mirror_url: string, | |
| name: string, | |
| notifications_url: string, | |
| open_issues: tinyint, | |
| open_issues_count: tinyint, | |
| owner: struct < | |
| avatar_url: string, | |
| events_url: string, | |
| followers_url: string, | |
| following_url: string, | |
| gists_url: string, | |
| gravatar_id: binary, | |
| id: int, | |
| login: string, | |
| organizations_url: string, | |
| received_events_url: string, | |
| repos_url: string, | |
| starred_url: string, | |
| subscriptions_url: string, | |
| type: string, | |
| url: string>, | |
| private: boolean, | |
| pulls_url: string, | |
| pushed_at: timestamp, | |
| size: smallint, | |
| ssh_url: string, | |
| stargazers_url: string, | |
| statuses_url: string, | |
| subscribers_url: string, | |
| subscription_url: string, | |
| svn_url: string, | |
| tags_url: string, | |
| teams_url: string, | |
| trees_url: string, | |
| updated_at: timestamp, | |
| url: string, | |
| watchers: tinyint, | |
| watchers_count: tinyint>, | |
| sha: binary, | |
| user: struct < | |
| avatar_url: string, | |
| events_url: string, | |
| followers_url: string, | |
| following_url: string, | |
| gists_url: string, | |
| gravatar_id: binary, | |
| id: int, | |
| login: string, | |
| organizations_url: string, | |
| received_events_url: string, | |
| repos_url: string, | |
| starred_url: string, | |
| subscriptions_url: string, | |
| type: string, | |
| url: string>>, | |
| body: string, | |
| changed_files: tinyint, | |
| closed_at: timestamp, | |
| comments: tinyint, | |
| comments_url: string, | |
| commits: tinyint, | |
| commits_url: string, | |
| created_at: timestamp, | |
| deletions: tinyint, | |
| diff_url: string, | |
| head: struct < | |
| label: string, | |
| ref: string, | |
| repo: struct < | |
| archive_url: string, | |
| assignees_url: string, | |
| blobs_url: string, | |
| branches_url: string, | |
| clone_url: string, | |
| collaborators_url: string, | |
| comments_url: string, | |
| commits_url: string, | |
| compare_url: string, | |
| contents_url: string, | |
| contributors_url: string, | |
| created_at: timestamp, | |
| default_branch: string, | |
| description: string, | |
| downloads_url: string, | |
| events_url: string, | |
| fork: boolean, | |
| forks: tinyint, | |
| forks_count: tinyint, | |
| forks_url: string, | |
| full_name: string, | |
| git_commits_url: string, | |
| git_refs_url: string, | |
| git_tags_url: string, | |
| git_url: string, | |
| has_downloads: boolean, | |
| has_issues: boolean, | |
| has_wiki: boolean, | |
| homepage: string, | |
| hooks_url: string, | |
| html_url: string, | |
| id: int, | |
| issue_comment_url: string, | |
| issue_events_url: string, | |
| issues_url: string, | |
| keys_url: string, | |
| labels_url: string, | |
| language: string, | |
| languages_url: string, | |
| master_branch: string, | |
| merges_url: string, | |
| milestones_url: string, | |
| mirror_url: string, | |
| name: string, | |
| notifications_url: string, | |
| open_issues: tinyint, | |
| open_issues_count: tinyint, | |
| owner: struct < | |
| avatar_url: string, | |
| events_url: string, | |
| followers_url: string, | |
| following_url: string, | |
| gists_url: string, | |
| gravatar_id: binary, | |
| id: int, | |
| login: string, | |
| organizations_url: string, | |
| received_events_url: string, | |
| repos_url: string, | |
| starred_url: string, | |
| subscriptions_url: string, | |
| type: string, | |
| url: string>, | |
| private: boolean, | |
| pulls_url: string, | |
| pushed_at: timestamp, | |
| size: smallint, | |
| ssh_url: string, | |
| stargazers_url: string, | |
| statuses_url: string, | |
| subscribers_url: string, | |
| subscription_url: string, | |
| svn_url: string, | |
| tags_url: string, | |
| teams_url: string, | |
| trees_url: string, | |
| updated_at: timestamp, | |
| url: string, | |
| watchers: tinyint, | |
| watchers_count: tinyint>, | |
| sha: binary, | |
| user: struct < | |
| avatar_url: string, | |
| events_url: string, | |
| followers_url: string, | |
| following_url: string, | |
| gists_url: string, | |
| gravatar_id: binary, | |
| id: int, | |
| login: string, | |
| organizations_url: string, | |
| received_events_url: string, | |
| repos_url: string, | |
| starred_url: string, | |
| subscriptions_url: string, | |
| type: string, | |
| url: string>>, | |
| html_url: string, | |
| id: int, | |
| issue_url: string, | |
| merge_commit_sha: binary, | |
| mergeable: boolean, | |
| mergeable_state: string, | |
| merged: boolean, | |
| merged_at: timestamp, | |
| merged_by: struct < | |
| avatar_url: string, | |
| events_url: string, | |
| followers_url: string, | |
| following_url: string, | |
| gists_url: string, | |
| gravatar_id: binary, | |
| id: int, | |
| login: string, | |
| organizations_url: string, | |
| received_events_url: string, | |
| repos_url: string, | |
| starred_url: string, | |
| subscriptions_url: string, | |
| type: string, | |
| url: string>, | |
| milestone: struct < | |
| closed_issues: tinyint, | |
| created_at: timestamp, | |
| creator: struct < | |
| avatar_url: string, | |
| events_url: string, | |
| followers_url: string, | |
| following_url: string, | |
| gists_url: string, | |
| gravatar_id: binary, | |
| id: int, | |
| login: string, | |
| organizations_url: string, | |
| received_events_url: string, | |
| repos_url: string, | |
| starred_url: string, | |
| subscriptions_url: string, | |
| type: string, | |
| url: string>, | |
| description: string, | |
| due_on: timestamp, | |
| id: int, | |
| labels_url: string, | |
| number: tinyint, | |
| open_issues: tinyint, | |
| state: string, | |
| title: string, | |
| updated_at: timestamp, | |
| url: string>, | |
| number: tinyint, | |
| patch_url: string, | |
| review_comment_url: string, | |
| review_comments: tinyint, | |
| review_comments_url: string, | |
| state: string, | |
| title: string, | |
| updated_at: timestamp, | |
| url: string, | |
| user: struct < | |
| avatar_url: string, | |
| events_url: string, | |
| followers_url: string, | |
| following_url: string, | |
| gists_url: string, | |
| gravatar_id: binary, | |
| id: int, | |
| login: string, | |
| organizations_url: string, | |
| received_events_url: string, | |
| repos_url: string, | |
| starred_url: string, | |
| subscriptions_url: string, | |
| type: string, | |
| url: string>>, | |
| ref: string, | |
| ref_type: string, | |
| shas: array <array <uniontype <boolean,string,timestamp>>>, | |
| size: tinyint, | |
| target: struct < | |
| followers: tinyint, | |
| gravatar_id: binary, | |
| id: int, | |
| login: string, | |
| repos: tinyint>, | |
| url: string>, | |
| public: boolean, | |
| repository: struct < | |
| created_at: timestamp, | |
| description: string, | |
| fork: boolean, | |
| forks: tinyint, | |
| has_downloads: boolean, | |
| has_issues: boolean, | |
| has_wiki: boolean, | |
| homepage: string, | |
| id: int, | |
| integrate_branch: string, | |
| language: string, | |
| master_branch: string, | |
| name: string, | |
| open_issues: tinyint, | |
| organization: string, | |
| owner: string, | |
| private: boolean, | |
| pushed_at: timestamp, | |
| size: smallint, | |
| stargazers: tinyint, | |
| url: string, | |
| watchers: tinyint>, | |
| type: string, | |
| url: string | |
| ) |
Author
Author
The code I used is posted here: https://github.com/hortonworks/hive-json
Author
Ok, the code has been incorporated into the ORC tools.
JSON Schema discovery tool documentation
JSON Schema discovery tool code
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This was discovered by looking at the Jan 2013 logs.
events: 5365969
gzip json: 1.6GB
json: 8.3GB