Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save Riduidel/84164f9bdcda6549cfb852822425cdac to your computer and use it in GitHub Desktop.
Save Riduidel/84164f9bdcda6549cfb852822425cdac to your computer and use it in GitHub Desktop.
A JBang script that extracts Shaarli links matching some constraints from a Mastodon archive and transforms them into Netscape-style bookmarks for import into Shaarli
///usr/bin/env jbang "$0" "$@" ; exit $?
//JAVA 21+
//DEPS info.picocli:picocli:4.6.2
//DEPS commons-io:commons-io:2.15.1
//DEPS com.fasterxml.jackson.core:jackson-databind:2.16.1
//DEPS com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.16.1
//DEPS org.apache.commons:commons-text:1.13.1
//DEPS org.jsoup:jsoup:1.20.1
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.Callable;
import java.util.logging.Logger;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.StringEscapeUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonSubTypes;
import com.fasterxml.jackson.annotation.JsonSubTypes.Type;
import com.fasterxml.jackson.annotation.JsonTypeInfo;
import com.fasterxml.jackson.annotation.JsonTypeInfo.As;
import com.fasterxml.jackson.annotation.JsonTypeInfo.Id;
import com.fasterxml.jackson.core.StreamReadFeature;
import com.fasterxml.jackson.core.exc.StreamReadException;
import com.fasterxml.jackson.databind.DatabindException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.json.JsonMapper;
import picocli.CommandLine;
/**
 * JBang script that reads a Mastodon outbox export, selects the posts whose last paragraph ends
 * with a link followed by its hashtags, and writes them as a Netscape-style bookmarks file.
 *
 * <p>Run that script with `jbang ExtractShaarliLinksFromMastodonArchive.java`
 */
@CommandLine.Command(
    description = {"This application finds in Mastodon post backup the Shaarli links, and transforms these links into",
        "a Netscape-compatible bookmarks file."}
)
public class ExtractShaarliLinksFromMastodonArchive implements Callable<Integer> {
    private static final Logger logger = Logger.getLogger(ExtractShaarliLinksFromMastodonArchive.class.getName());

    /** Entry point: lets picocli parse the arguments, runs {@link #call()} and exits with its result. */
    public static void main(String... args) {
        int exitCode = new CommandLine(new ExtractShaarliLinksFromMastodonArchive()).execute(args);
        System.exit(exitCode);
    }

    @CommandLine.Option(names = {"-i", "--input"}, description = "Input json file.", defaultValue = "outbox.json")
    private Path input;

    @CommandLine.Option(names = {"-o", "--output"}, description = "Output html file.", defaultValue = "export.html")
    private Path output;

    /** Inclusive lower bound on the publication date; {@link #call()} uses the epoch when unset. */
    @CommandLine.Option(names = {"-a", "--after"}, description = "Only include messages after this date.")
    private Date after;

    /** Inclusive upper bound on the publication date; {@link #call()} uses the current time when unset. */
    @CommandLine.Option(names = {"-b", "--before"}, description = "Only include messages before this date.")
    private Date before;

    /** Posts whose link URL contains this text are not exported. */
    @CommandLine.Option(names = {"-s", "--shaarli-server"}, description = "Shaarli server url", defaultValue="nicolas-delsaux.hd.free.fr/Shaarli")
    private String shaarliServer;

    // NOTE(review): the default tag reads "mastodon-to-shaarl", possibly a truncated "shaarli".
    // Left unchanged because it is part of the emitted data; confirm with the author.
    @CommandLine.Option(names = {"-t", "--default-tags"}, description = "Default tags for created links", defaultValue="@tool:mastodon-to-shaarl")
    private List<String> defaultTags;

    /**
     * One bookmark to export.
     *
     * @param description text written in the {@code <DD>} element
     * @param url target of the bookmark
     * @param tags tag names, each rendered as one nested {@code <H3>} folder level
     * @param published publication date, written (in seconds) as the bookmark dates
     */
    static record ShaarliLink(String description, String url, List<String> tags, Date published) {
        /** Timeout, in milliseconds, used when downloading the bookmarked page to read its title. */
        private static final int TITLE_FETCH_TIMEOUT_MILLIS = 10000;

        /**
         * Renders this link as a Netscape bookmark entry wrapped in one nested folder per tag.
         *
         * <p>The bookmarked page is downloaded to obtain its title, so this method performs
         * network I/O. Every piece of text is HTML-escaped before being written.
         *
         * @return the HTML fragment for this bookmark
         */
        public String toFirefoxBookmark() {
            long publishedAt = published.getTime() / 1000;
            String linkIndent = "\t".repeat(tags.size());
            String htmlLink = String.format(
                "\n%s<A HREF=\"%s\" ADD_DATE=\"%d\" LAST_VISIT=\"%d\" LAST_MODIFIED=\"%d\">%s</A>",
                linkIndent,
                escapeHtml(url),
                publishedAt,
                publishedAt,
                publishedAt,
                escapeHtml(fetchTitle()));
            String htmlDescription = String.format("\n%s<DD>%s</DD>", linkIndent, escapeHtml(description));

            List<String> openings = new ArrayList<>();
            List<String> closings = new ArrayList<>();
            for (int depth = 0; depth < tags.size(); depth++) {
                String indent = "\t".repeat(depth);
                openings.add(String.format("%s<DT><H3 ADD_DATE=\"%d\">%s</H3></DT>\n%s<DL><p>",
                    indent, publishedAt, escapeHtml(tags.get(depth)), indent));
                // Folders are closed in reverse order of opening, hence insertion at the head.
                closings.add(0, String.format("%s</DL><p>", indent));
            }
            return String.join("\n", openings)
                + htmlLink
                + htmlDescription
                + "\n"
                + String.join("\n", closings);
        }

        /**
         * Downloads the bookmarked page and returns its title, falling back to the URL itself when
         * the page cannot be fetched, the URL is not usable, or the page has no title.
         */
        private String fetchTitle() {
            try {
                String title = Jsoup.parse(new URI(url).toURL(), TITLE_FETCH_TIMEOUT_MILLIS).title();
                return title.isBlank() ? url : title;
            } catch (IOException | URISyntaxException | IllegalArgumentException e) {
                // IllegalArgumentException is what URI.toURL() throws for a non-absolute URI.
                return url;
            }
        }

        /** Escapes the characters that are significant in HTML text and double-quoted attributes. */
        private static String escapeHtml(String text) {
            if (text == null) {
                return "";
            }
            return text
                .replace("&", "&amp;")
                .replace("<", "&lt;")
                .replace(">", "&gt;")
                .replace("\"", "&quot;");
        }
    }

    /** One entry of a post's {@code tag} array. */
    @JsonIgnoreProperties(ignoreUnknown = true)
    static record Tag (
        @JsonProperty String type,
        @JsonProperty String href,
        @JsonProperty String name
    ) {
    }

    /** The {@code object} of a {@code Create} message, i.e. the post itself. */
    @JsonIgnoreProperties(ignoreUnknown = true)
    static record Content (
        @JsonProperty String id,
        @JsonProperty String type,
        @JsonProperty String summary,
        @JsonProperty String inReplyTo,
        @JsonProperty Date published,
        @JsonProperty String url,
        @JsonProperty String attributedTo,
        @JsonProperty List<String> to,
        @JsonProperty List<String> cc,
        @JsonProperty boolean sensitive,
        @JsonProperty String atomUri,
        @JsonProperty String inReplyToAtomUri,
        @JsonProperty String conversation,
        @JsonProperty String content,
        @JsonProperty List<Tag> tag
    ) {
        /**
         * Tries to read this post as "description, link, hashtags".
         *
         * <p>The last {@code <p>} of the post must end with one {@code <a>} per declared tag (in
         * declaration order), preceded by the bookmarked link. Blank or "-" text between those
         * elements is ignored. Whatever text remains in the post becomes the description.
         *
         * @param defaultTags tags appended after the tags found in the post
         * @param shaarliServer posts whose link URL contains this text are skipped
         * @return the extracted link, or empty when the post does not have the expected shape
         */
        Optional<ShaarliLink> toShaarliLink(List<String> defaultTags, CharSequence shaarliServer) {
            List<Tag> declaredTags = tag == null ? List.of() : tag;
            // Posts carrying anything other than hashtags (mentions, ...) are skipped.
            // NOTE(review): the original comment said untagged posts should be removed too, but a
            // post with no tag at all passes this check; behaviour kept as is, confirm the intent.
            boolean onlyHashtags = declaredTags.stream()
                .allMatch(t -> t != null && "Hashtag".equals(t.type()));
            if (!onlyHashtags || content == null) {
                return Optional.empty();
            }
            // The tag collection strips accents, so the *real* tag text is read back from the
            // rendered content instead.
            Document doc = Jsoup.parseBodyFragment(StringEscapeUtils.unescapeHtml4(content));
            Element body = doc.body();
            Node paragraph = body.lastChild();
            if (!isNamed(paragraph, "p")) {
                return Optional.empty();
            }
            List<String> usedTags = new ArrayList<>();
            // Consume the tag links from the end of the paragraph, last declared tag first.
            for (int i = declaredTags.size() - 1; i >= 0; i--) {
                Tag expected = declaredTags.get(i);
                Node tagLink = paragraph.lastChild();
                if (!isNamed(tagLink, "a")) {
                    return Optional.empty();
                }
                String href;
                try {
                    href = StringUtils.stripAccents(
                        URLDecoder.decode(tagLink.attr("href"), StandardCharsets.UTF_8));
                } catch (IllegalArgumentException e) {
                    // Malformed percent-escape: this cannot be the expected tag link.
                    return Optional.empty();
                }
                if (!href.equals(expected.href())) {
                    return Optional.empty();
                }
                Node span = tagLink.lastChild();
                if (!isNamed(span, "span")) {
                    return Optional.empty();
                }
                if (!(span.firstChild() instanceof TextNode label)) {
                    return Optional.empty();
                }
                // Walking backwards, so prepend to keep the tags in their original order.
                usedTags.add(0, label.text());
                tagLink.remove();
                removeAllUselessTexts(paragraph);
            }
            // With the tags gone, the last element should be the bookmarked link.
            Node lastLink = paragraph.lastChild();
            if (!isNamed(lastLink, "a")) {
                return Optional.empty();
            }
            String usedUrl = lastLink.attr("href");
            lastLink.remove();
            removeAllUselessTexts(paragraph);
            if (shaarliServer != null && usedUrl.contains(shaarliServer)) {
                return Optional.empty();
            }
            // Everything left in the post is taken as the description.
            String usedDescription = body.text();
            if (defaultTags != null) {
                usedTags.addAll(defaultTags);
            }
            return Optional.of(new ShaarliLink(usedDescription, usedUrl, usedTags, published));
        }

        /** Null-safe test of a node's normalized name. */
        private static boolean isNamed(Node node, String name) {
            return node != null && node.nameIs(name);
        }

        /**
         * Removes trailing text nodes of {@code paragraph} that are blank or a lone "-", stopping
         * at the first trailing node that is anything else.
         */
        void removeAllUselessTexts(Node paragraph) {
            while (paragraph.lastChild() instanceof TextNode trailing) {
                String text = trailing.text();
                if (!text.isBlank() && !text.trim().equals("-")) {
                    return;
                }
                trailing.remove();
            }
        }
    }

    /** An outbox item, deserialized by Jackson according to its {@code type} property. */
    @JsonTypeInfo(use = Id.NAME, include = As.PROPERTY, property="type")
    @JsonSubTypes({ @Type(value=Create.class, name="Create"), @Type(value=Announce.class, name="Announce")})
    static interface Message {
    }

    /** An {@code Announce} item; none of its properties are read. */
    @JsonIgnoreProperties(ignoreUnknown = true)
    static record Announce () implements Message {}

    /** A {@code Create} item, wrapping the post in {@code object}. */
    @JsonIgnoreProperties(ignoreUnknown = true)
    static record Create (
        @JsonProperty String id,
        @JsonProperty String type,
        @JsonProperty String actor,
        @JsonProperty Date published,
        @JsonProperty List<String> to,
        @JsonProperty List<String> cc,
        @JsonProperty Content object
    ) implements Message {}

    /** Root of the outbox file; only {@code orderedItems} is read. */
    @JsonIgnoreProperties(ignoreUnknown = true)
    static record Outbox(
        @JsonProperty("orderedItems") List<Message> messages) {
        /**
         * Returns the posts of the {@code Create} messages published within the given bounds.
         *
         * <p>Messages lacking a publication date or a post are skipped.
         *
         * @param publishedAfter inclusive lower bound
         * @param publishedBefore inclusive upper bound
         * @return the matching posts, in outbox order
         */
        Collection<Content> findContent(Date publishedAfter, Date publishedBefore) {
            if (messages == null) {
                return List.of();
            }
            return messages.stream()
                .filter(Create.class::isInstance)
                .map(Create.class::cast)
                .filter(c -> c.published() != null && c.object() != null)
                .filter(c -> c.published().compareTo(publishedAfter) >= 0)
                .filter(c -> c.published().compareTo(publishedBefore) <= 0)
                .map(Create::object)
                .toList();
        }
    }

    /**
     * Reads the outbox, converts the posts in the requested date range and writes the bookmarks
     * file as UTF-8.
     *
     * @return 0 on success
     * @throws Exception when the input cannot be read or the output cannot be written
     */
    @Override
    public Integer call() throws Exception {
        Outbox outbox = readJson(input);
        Collection<Content> filteredContent = outbox.findContent(
            after == null ? new Date(0) : after,
            before == null ? new Date() : before
        );
        Collection<ShaarliLink> links = toShaarliLinks(filteredContent);
        FileUtils.write(output.toFile(), generateBookmarksFile(links), StandardCharsets.UTF_8);
        logger.info(String.format("✅ Written bookmarks to %s", output));
        return 0;
    }

    /** Wraps the rendered bookmarks between the Netscape bookmark file header and the closing list tag. */
    private String generateBookmarksFile(Collection<ShaarliLink> links) {
        String header = """
            <!DOCTYPE NETSCAPE-Bookmark-file-1>
            <!--This is an automatically generated file.
            It will be read and overwritten.
            Do Not Edit! -->
            <Title>Bookmarks</Title>
            <H1>Bookmarks</H1>
            <DL>
            """;
        return links.stream()
            .map(ShaarliLink::toFirefoxBookmark)
            .collect(Collectors.joining("\n", header, "</DL>"));
    }

    /** Converts every post that has the expected shape, dropping the others. */
    private Collection<ShaarliLink> toShaarliLinks(Collection<Content> filteredContent) {
        return filteredContent.stream()
            .parallel()
            .flatMap(c -> c.toShaarliLink(defaultTags, shaarliServer).stream())
            .toList();
    }

    /**
     * Deserializes the outbox file.
     *
     * @param input path of the JSON file to read
     * @return the parsed outbox
     * @throws IOException when the file cannot be read or is not a valid outbox document
     */
    private Outbox readJson(Path input) throws StreamReadException, DatabindException, IOException {
        logger.info(String.format("⌛️ Reading outbox from %s", input));
        ObjectMapper mapper = JsonMapper.builder()
            .enable(StreamReadFeature.INCLUDE_SOURCE_IN_LOCATION)
            .build();
        Outbox outbox = mapper.readValue(input.toFile(), Outbox.class);
        // Logged only once parsing has succeeded, so a failure is not reported as a success.
        logger.info(String.format("✅ Read outbox from %s", input));
        return outbox;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment