Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save Crydust/d01ad6ad85060acbc248d9445f7f5e60 to your computer and use it in GitHub Desktop.

Select an option

Save Crydust/d01ad6ad85060acbc248d9445f7f5e60 to your computer and use it in GitHub Desktop.
Convert Pocket Csv Export To Bookmarks Html
package be.crydust;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.Duration;
import java.time.Instant;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.time.ZonedDateTime;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.ListIterator;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.text.StringEscapeUtils;
/*
<dependencies>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-csv</artifactId>
<version>1.14.0</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-text</artifactId>
<version>1.11.0</version>
</dependency>
</dependencies>
*/
/**
 * Converts a Pocket CSV export into a Netscape-style bookmarks HTML file.
 *
 * <p>The program reads the CSV, sorts the bookmarks by the time they were added, tries to download
 * a real page title for every bookmark that has none, and writes the resulting bookmark list
 * inside a single "Pocket" folder.
 *
 * <p>Usage: {@code Main [input.csv [output.html]]}. Without arguments the original hard-coded
 * paths are used; with only an input path the output is written next to it.
 */
public class Main {
    /** Time zone used when rendering a bookmark's creation time as a local date-time. */
    public static final ZoneId TIMEZONE = ZoneId.of("Europe/Brussels");

    /** Input file used when no command-line argument is given. */
    private static final String DEFAULT_INPUT = "/home/kristof/Downloads/pocket/part_000000.csv";

    /** Connect timeout and per-request timeout for title lookups. */
    private static final Duration HTTP_TIMEOUT = Duration.ofSeconds(5);

    /** Browser-like User-Agent sent with every title lookup. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36";

    /** Matches an attribute-less {@code <title>} element, case-insensitively; compiled once. */
    private static final Pattern TITLE_PATTERN = Pattern.compile("(?i)<title>([^<>]+?)</title>");

    /** Matches any run of whitespace, including line breaks. */
    private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");

    /**
     * One bookmark entry. Every entry is terminated by a newline so that each bookmark sits on
     * its own line, which importers that read the file line by line rely on. {@code %s} is used
     * for the timestamps on purpose: unlike {@code %d} it is not affected by the default locale.
     */
    private static final String BOOKMARK_LINE =
            " <DT><A HREF=\"%s\" ADD_DATE=\"%s\" LAST_MODIFIED=\"%s\">%s</A>\n";

    /**
     * A single bookmark.
     *
     * @param title     display title; equal to {@code url} (or blank) when no title is known
     * @param url       bookmarked address
     * @param timeAdded creation time in seconds since the Unix epoch
     */
    record Bookmark(String title, String url, long timeAdded) {
        /** Returns the creation time as a local date-time in {@link #TIMEZONE}. */
        LocalDateTime timeAddedAsLocalDateTime() {
            return ZonedDateTime.ofInstant(Instant.ofEpochSecond(timeAdded), TIMEZONE).toLocalDateTime();
        }
    }

    /**
     * Program entry point.
     *
     * @param args optional {@code [input.csv [output.html]]}; the input defaults to
     *             {@link #DEFAULT_INPUT} and the output defaults to a sibling of the input
     *             whose {@code .csv} suffix is replaced by {@code .bookmarks.html}
     * @throws IOException if the output file cannot be written
     */
    public static void main(String[] args) throws IOException {
        Path input = Path.of(args.length > 0 ? args[0] : DEFAULT_INPUT);
        Path output = args.length > 1 ? Path.of(args[1]) : defaultOutputFor(input);

        List<Bookmark> bookmarks = readBookmarksCsv(input);
        retrieveMissingTitles(bookmarks);
        Files.writeString(output, toHtml(bookmarks).toString(), StandardCharsets.UTF_8);
    }

    /**
     * Derives the output path from the input path: same directory, with a trailing {@code .csv}
     * (any case) replaced by {@code .bookmarks.html}.
     */
    private static Path defaultOutputFor(Path input) {
        Path fileNamePath = input.getFileName();
        if (fileNamePath == null) {
            throw new IllegalArgumentException("Input path has no file name: " + input);
        }
        String fileName = fileNamePath.toString();
        String suffix = ".csv";
        // regionMatches returns false (rather than throwing) when the name is shorter than the suffix.
        boolean hasCsvSuffix = fileName.regionMatches(
                true, fileName.length() - suffix.length(), suffix, 0, suffix.length());
        String baseName = hasCsvSuffix
                ? fileName.substring(0, fileName.length() - suffix.length())
                : fileName;
        return input.resolveSibling(baseName + ".bookmarks.html");
    }

    /**
     * Renders the bookmarks as a bookmarks HTML document with a single "Pocket" folder.
     * URLs and titles are HTML-escaped.
     *
     * @param bookmarks bookmarks to render, in output order
     * @return the complete document
     */
    private static StringBuilder toHtml(List<Bookmark> bookmarks) {
        String prefix = """
                <!DOCTYPE NETSCAPE-Bookmark-file-1>
                <META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
                <meta http-equiv="Content-Security-Policy"
                content="default-src 'self'; script-src 'none'; img-src data: *; object-src 'none'"></meta>
                <TITLE>Bookmarks</TITLE>
                <H1>Bookmarks Menu</H1>
                <DL><p>
                <DT><H3 ADD_DATE="1748701919" LAST_MODIFIED="1748701919">Pocket</H3>
                <DL><p>
                """;
        String suffix = """
                </p></DL>
                </p></DL>
                """;

        StringBuilder sb = new StringBuilder(prefix);
        for (Bookmark bookmark : bookmarks) {
            sb.append(String.format(
                    BOOKMARK_LINE,
                    StringEscapeUtils.escapeHtml4(bookmark.url()),
                    bookmark.timeAdded(),
                    bookmark.timeAdded(),
                    StringEscapeUtils.escapeHtml4(bookmark.title())));
        }
        sb.append(suffix);
        return sb;
    }

    /**
     * Replaces, in place, every bookmark that lacks a real title with a copy carrying the title
     * of the downloaded page. This is best effort: a bookmark whose page cannot be fetched, does
     * not answer with a 2xx status, or has no usable {@code <title>} is left unchanged.
     *
     * <p>If the thread is interrupted, the interrupt flag is restored and the remaining
     * bookmarks are left unchanged.
     *
     * @param bookmarks mutable list whose elements may be replaced
     */
    private static void retrieveMissingTitles(List<Bookmark> bookmarks) {
        try (HttpClient httpClient = HttpClient.newBuilder()
                .connectTimeout(HTTP_TIMEOUT)
                // Without this a redirecting URL yields the redirect page, not the bookmarked page.
                .followRedirects(HttpClient.Redirect.NORMAL)
                .build()) {
            ListIterator<Bookmark> iter = bookmarks.listIterator();
            while (iter.hasNext()) {
                Bookmark bookmark = iter.next();
                if (!isMissingTitle(bookmark)) {
                    continue;
                }
                try {
                    fetchTitle(httpClient, bookmark.url()).ifPresent(title ->
                            iter.set(new Bookmark(title, bookmark.url(), bookmark.timeAdded())));
                } catch (IOException | IllegalArgumentException e) {
                    // IllegalArgumentException covers malformed or non-HTTP URLs.
                    System.err.println("Error fetching URL: " + bookmark.url() + " - " + e.getMessage());
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    System.err.println("Error fetching URL: " + bookmark.url() + " - " + e.getMessage());
                    return;
                }
            }
        }
    }

    /** Returns whether the bookmark has no real title: it is blank or merely repeats the URL. */
    private static boolean isMissingTitle(Bookmark bookmark) {
        return bookmark.title().isBlank() || bookmark.title().equals(bookmark.url());
    }

    /**
     * Downloads {@code url} and extracts its page title.
     *
     * @return the title, or empty if the response status is not 2xx or no usable title is found
     * @throws IOException              if the request fails or times out
     * @throws InterruptedException     if the calling thread is interrupted while waiting
     * @throws IllegalArgumentException if {@code url} is not a valid HTTP(S) URI
     */
    private static Optional<String> fetchTitle(HttpClient httpClient, String url)
            throws IOException, InterruptedException {
        HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create(url))
                .header("User-Agent", USER_AGENT)
                .GET()
                .timeout(HTTP_TIMEOUT)
                .build();
        HttpResponse<String> response = httpClient.send(request, HttpResponse.BodyHandlers.ofString());
        int status = response.statusCode();
        if (status < 200 || status >= 300) {
            // The title of an error or redirect page would be worse than keeping the URL.
            System.err.println("Error fetching URL: " + url + " - HTTP status " + status);
            return Optional.empty();
        }
        return extractTitle(response.body());
    }

    /**
     * Extracts the text of the first attribute-less {@code <title>} element, with HTML entities
     * decoded, whitespace runs (including line breaks) collapsed to one space, and surrounding
     * whitespace removed.
     *
     * @return the cleaned title, or empty if there is no such element or its text is empty
     */
    static Optional<String> extractTitle(String html) {
        Matcher titleMatcher = TITLE_PATTERN.matcher(html);
        if (!titleMatcher.find()) {
            return Optional.empty();
        }
        String decoded = StringEscapeUtils.unescapeHtml4(titleMatcher.group(1));
        String title = WHITESPACE_RUN.matcher(decoded).replaceAll(" ").strip();
        return title.isEmpty() ? Optional.empty() : Optional.of(title);
    }

    /**
     * Reads a Pocket CSV export with the columns {@code title, url, time_added, tags, status}
     * (the first record is skipped as the header) and returns the bookmarks sorted by ascending
     * {@code time_added}. The {@code tags} and {@code status} columns are not used.
     *
     * @param path CSV file, read as UTF-8
     * @return a mutable list of bookmarks, oldest first
     * @throws UncheckedIOException  if the file cannot be read
     * @throws NumberFormatException if a {@code time_added} value is not a valid long
     */
    private static List<Bookmark> readBookmarksCsv(Path path) {
        CSVFormat format = CSVFormat.DEFAULT.builder()
                .setHeader("title", "url", "time_added", "tags", "status")
                .setSkipHeaderRecord(true)
                .build();
        List<Bookmark> bookmarks = new ArrayList<>();
        try (CSVParser parser = CSVParser.parse(path, StandardCharsets.UTF_8, format)) {
            for (CSVRecord record : parser) {
                bookmarks.add(new Bookmark(
                        record.get("title"),
                        record.get("url"),
                        Long.parseLong(record.get("time_added"))));
            }
        } catch (IOException e) {
            throw new UncheckedIOException("Failed to read Pocket export: " + path, e);
        }
        bookmarks.sort(Comparator.comparingLong(Bookmark::timeAdded));
        return bookmarks;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment