Created
September 29, 2023 20:24
-
-
Save acheong08/894db3d4dffb7c577dd5773b9598ab95 to your computer and use it in GitHub Desktop.
Convert Twitter Database Dump to SQLite
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bufio" | |
"database/sql" | |
"fmt" | |
"os" | |
"strings" | |
_ "github.com/mattn/go-sqlite3" | |
) | |
type UserData struct { | |
Email string | |
Name string | |
ScreenName string | |
Followers int | |
CreatedAt string | |
} | |
const batchSize = 1000 | |
func main() { | |
// Read all files in the current directory | |
files, err := os.ReadDir(".") | |
if err != nil { | |
panic(err) | |
} | |
// Process each .txt file | |
for _, f := range files { | |
if strings.HasSuffix(f.Name(), ".txt") { | |
fmt.Printf("Processing file: %s\n", f.Name()) | |
// Generate a database file name based on the text file name | |
dbFileName := fmt.Sprintf("%s.db", strings.TrimSuffix(f.Name(), ".txt")) | |
// Open SQLite database | |
db, err := sql.Open("sqlite3", dbFileName) | |
if err != nil { | |
panic(err) | |
} | |
// Create table if it doesn't exist | |
createTable(db) | |
// Open the file | |
file, err := os.Open(f.Name()) | |
if err != nil { | |
fmt.Printf("Error opening file %s: %v\n", f.Name(), err) | |
continue | |
} | |
// Process the file and insert data into the database | |
processFile(db, file) | |
file.Close() | |
// Close the database | |
db.Close() | |
} | |
} | |
} | |
func createTable(db *sql.DB) { | |
query := ` | |
CREATE TABLE IF NOT EXISTS users ( | |
id INTEGER PRIMARY KEY AUTOINCREMENT, | |
email TEXT, | |
name TEXT, | |
screen_name TEXT, | |
followers INTEGER, | |
created_at TEXT | |
) | |
` | |
_, err := db.Exec(query) | |
if err != nil { | |
panic(err) | |
} | |
} | |
func processFile(db *sql.DB, file *os.File) { | |
scanner := bufio.NewScanner(file) | |
records := make([]UserData, 0) | |
for scanner.Scan() { | |
data := parseLine(scanner.Text()) | |
records = append(records, data) | |
// If the batch size is reached, insert the records into the database | |
if len(records) == batchSize { | |
batchInsertIntoDatabase(db, records) | |
records = records[:0] | |
} | |
} | |
// Insert any remaining records | |
if len(records) > 0 { | |
batchInsertIntoDatabase(db, records) | |
} | |
} | |
func parseLine(line string) UserData { | |
parts := strings.Split(line, " - ") | |
var data UserData | |
for _, part := range parts { | |
kv := strings.SplitN(part, ": ", 2) | |
if len(kv) != 2 { | |
continue | |
} | |
key := kv[0] | |
value := kv[1] | |
switch key { | |
case "Email": | |
data.Email = value | |
case "Name": | |
data.Name = value | |
case "ScreenName": | |
data.ScreenName = value | |
case "Followers": | |
fmt.Sscanf(value, "%d", &data.Followers) | |
case "Created At": | |
data.CreatedAt = value | |
} | |
} | |
return data | |
} | |
func batchInsertIntoDatabase(db *sql.DB, records []UserData) { | |
tx, err := db.Begin() | |
if err != nil { | |
panic(err) | |
} | |
stmt, err := tx.Prepare("INSERT INTO users (email, name, screen_name, followers, created_at) VALUES (?, ?, ?, ?, ?)") | |
if err != nil { | |
panic(err) | |
} | |
defer stmt.Close() | |
for _, record := range records { | |
_, err := stmt.Exec(record.Email, record.Name, record.ScreenName, record.Followers, record.CreatedAt) | |
if err != nil { | |
fmt.Printf("Error inserting record: %v\n", err) | |
} | |
} | |
tx.Commit() | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment