Created
May 31, 2025 14:43
-
-
Save Crydust/d01ad6ad85060acbc248d9445f7f5e60 to your computer and use it in GitHub Desktop.
Convert Pocket Csv Export To Bookmarks Html
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| package be.crydust; | |
| import org.apache.commons.csv.CSVFormat; | |
| import org.apache.commons.csv.CSVParser; | |
| import org.apache.commons.csv.CSVRecord; | |
| import org.apache.commons.text.StringEscapeUtils; | |
| import java.io.IOException; | |
| import java.net.URI; | |
| import java.net.http.HttpClient; | |
| import java.net.http.HttpRequest; | |
| import java.net.http.HttpResponse; | |
| import java.nio.charset.StandardCharsets; | |
| import java.nio.file.Files; | |
| import java.nio.file.Path; | |
| import java.time.Duration; | |
| import java.time.Instant; | |
| import java.time.LocalDateTime; | |
| import java.time.ZoneId; | |
| import java.time.ZonedDateTime; | |
| import java.util.ArrayList; | |
| import java.util.Comparator; | |
| import java.util.List; | |
| import java.util.ListIterator; | |
| import java.util.regex.Matcher; | |
| import java.util.regex.Pattern; | |
| /* | |
| <dependencies> | |
| <dependency> | |
| <groupId>org.apache.commons</groupId> | |
| <artifactId>commons-csv</artifactId> | |
| <version>1.14.0</version> | |
| </dependency> | |
| <dependency> | |
| <groupId>org.apache.commons</groupId> | |
| <artifactId>commons-text</artifactId> | |
| <version>1.11.0</version> | |
| </dependency> | |
| </dependencies> | |
| */ | |
| public class Main { | |
| public static final ZoneId TIMEZONE = ZoneId.of("Europe/Brussels"); | |
| record Bookmark(String title, String url, long timeAdded) { | |
| LocalDateTime timeAddedAsLocalDateTime() { | |
| return ZonedDateTime.ofInstant(Instant.ofEpochSecond(timeAdded), TIMEZONE).toLocalDateTime(); | |
| } | |
| } | |
| public static void main(String[] args) throws IOException { | |
| Path input = Path.of("/home/kristof/Downloads/pocket/part_000000.csv"); | |
| Path output = Path.of("/home/kristof/Downloads/pocket/part_000000.bookmarks.html"); | |
| List<Bookmark> bookmarks = readBookmarksCsv(input); | |
| retrieveMissingTitles(bookmarks); | |
| // for (Bookmark bookmark : bookmarks) { | |
| // System.out.println(bookmark.timeAddedAsLocalDateTime() + " - " + bookmark.title + " - " + bookmark.url); | |
| // } | |
| StringBuilder sb = toHtml(bookmarks); | |
| // System.out.println(sb.toString()); | |
| Files.writeString(output, sb.toString(), StandardCharsets.UTF_8); | |
| } | |
| private static StringBuilder toHtml(List<Bookmark> bookmarks) { | |
| String prefix = """ | |
| <!DOCTYPE NETSCAPE-Bookmark-file-1> | |
| <META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8"> | |
| <meta http-equiv="Content-Security-Policy" | |
| content="default-src 'self'; script-src 'none'; img-src data: *; object-src 'none'"></meta> | |
| <TITLE>Bookmarks</TITLE> | |
| <H1>Bookmarks Menu</H1> | |
| <DL><p> | |
| <DT><H3 ADD_DATE="1748701919" LAST_MODIFIED="1748701919">Pocket</H3> | |
| <DL><p> | |
| """; | |
| String line = " <DT><A HREF=\"%s\" ADD_DATE=\"%s\" LAST_MODIFIED=\"%s\">%s</A>"; | |
| String suffix = """ | |
| </p></DL> | |
| </p></DL> | |
| """; | |
| StringBuilder sb = new StringBuilder(); | |
| sb.append(prefix); | |
| for (Bookmark bookmark : bookmarks) { | |
| sb.append(String.format( | |
| line, | |
| StringEscapeUtils.escapeHtml4(bookmark.url), | |
| bookmark.timeAdded, | |
| bookmark.timeAdded, | |
| StringEscapeUtils.escapeHtml4(bookmark.title) | |
| )); | |
| } | |
| sb.append(suffix); | |
| return sb; | |
| } | |
| private static void retrieveMissingTitles(List<Bookmark> bookmarks) { | |
| Duration timeout = Duration.ofSeconds(5); | |
| try (HttpClient httpClient = HttpClient.newBuilder() | |
| .connectTimeout(timeout) | |
| .build()) { | |
| ListIterator<Bookmark> iter = bookmarks.listIterator(); | |
| while (iter.hasNext()) { | |
| Bookmark bookmark = iter.next(); | |
| if (bookmark.title.equals(bookmark.url)) { | |
| String html = ""; | |
| try { | |
| HttpRequest request = HttpRequest.newBuilder() | |
| .uri(URI.create(bookmark.url)) | |
| .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36") | |
| .GET() | |
| .timeout(timeout) | |
| .build(); | |
| HttpResponse<String> response = httpClient.send(request, HttpResponse.BodyHandlers.ofString()); | |
| html = response.body(); | |
| } catch (IOException | InterruptedException | IllegalArgumentException e) { | |
| System.err.println("Error fetching URL: " + bookmark.url + " - " + e.getMessage()); | |
| continue; | |
| } | |
| Pattern titlePattern = Pattern.compile("(?i)<title>([^<>]+?)</title>"); | |
| Matcher titeMatcher = titlePattern.matcher(html); | |
| if (titeMatcher.find()) { | |
| String correctTitle = StringEscapeUtils.unescapeHtml4(titeMatcher.group(1)).trim(); | |
| // System.out.println("correctTitle = " + correctTitle); | |
| iter.set(new Bookmark(correctTitle, bookmark.url, bookmark.timeAdded)); | |
| } | |
| } | |
| } | |
| } | |
| } | |
| private static List<Bookmark> readBookmarksCsv(Path path) { | |
| List<Bookmark> bookmarks = new ArrayList<>(); | |
| try (CSVParser parser = CSVParser.parse( | |
| path, | |
| StandardCharsets.UTF_8, | |
| CSVFormat.DEFAULT.builder() | |
| .setHeader("title", "url", "time_added", "tags", "status") | |
| .setSkipHeaderRecord(true) | |
| .build() | |
| )) { | |
| for (CSVRecord record : parser) { | |
| String title = record.get("title"); | |
| String url = record.get("url"); | |
| String timeAdded = record.get("time_added"); | |
| String tags = record.get("tags"); | |
| String status = record.get("status"); | |
| // Process each record as needed | |
| // System.out.printf("Title: %s, URL: %s%n", title, url); | |
| bookmarks.add(new Bookmark(title, url, Long.parseLong(timeAdded))); | |
| } | |
| } catch (IOException e) { | |
| throw new RuntimeException(e); | |
| } | |
| bookmarks.sort(Comparator.comparingLong(b -> b.timeAdded)); | |
| return bookmarks; | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment