Created
June 24, 2025 19:37
-
-
Save Riduidel/84164f9bdcda6549cfb852822425cdac to your computer and use it in GitHub Desktop.
A JBang script allowing Shaarli links matching some constraints to be transformed into Netscape-style bookmarks for import in Shaarli
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
///usr/bin/env jbang "$0" "$@" ; exit $? | |
//JAVA 21+ | |
//DEPS info.picocli:picocli:4.6.2 | |
//DEPS commons-io:commons-io:2.15.1 | |
//DEPS com.fasterxml.jackson.core:jackson-databind:2.16.1 | |
//DEPS com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.16.1 | |
//DEPS org.apache.commons:commons-text:1.13.1 | |
//DEPS org.jsoup:jsoup:1.20.1 | |
import java.io.IOException; | |
import java.io.UnsupportedEncodingException; | |
import java.net.URI; | |
import java.net.URISyntaxException; | |
import java.net.URLDecoder; | |
import java.nio.charset.Charset; | |
import java.nio.file.Path; | |
import java.util.ArrayList; | |
import java.util.Arrays; | |
import java.util.Collection; | |
import java.util.Date; | |
import java.util.List; | |
import java.util.Map; | |
import java.util.Optional; | |
import java.util.concurrent.Callable; | |
import java.util.logging.Logger; | |
import java.util.stream.Collectors; | |
import java.util.stream.IntStream; | |
import org.apache.commons.io.FileUtils; | |
import org.apache.commons.lang3.StringUtils; | |
import org.apache.commons.text.StringEscapeUtils; | |
import org.jsoup.Jsoup; | |
import org.jsoup.nodes.Document; | |
import org.jsoup.nodes.Element; | |
import org.jsoup.nodes.Node; | |
import org.jsoup.nodes.TextNode; | |
import com.fasterxml.jackson.annotation.JsonIgnoreProperties; | |
import com.fasterxml.jackson.annotation.JsonProperty; | |
import com.fasterxml.jackson.annotation.JsonSubTypes; | |
import com.fasterxml.jackson.annotation.JsonSubTypes.Type; | |
import com.fasterxml.jackson.annotation.JsonTypeInfo; | |
import com.fasterxml.jackson.annotation.JsonTypeInfo.As; | |
import com.fasterxml.jackson.annotation.JsonTypeInfo.Id; | |
import com.fasterxml.jackson.core.StreamReadFeature; | |
import com.fasterxml.jackson.core.exc.StreamReadException; | |
import com.fasterxml.jackson.databind.DatabindException; | |
import com.fasterxml.jackson.databind.ObjectMapper; | |
import com.fasterxml.jackson.databind.json.JsonMapper; | |
import picocli.CommandLine; | |
/** | |
 * Run this script with `jbang ExtractShaarliLinksFromMastodonArchive.java`.
*/ | |
@CommandLine.Command( | |
description = {"This application finds in Mastodon post backup the Shaarli links, and transforms these links into", | |
" * a Netscape-compatible bookmarks file."} | |
) | |
public class ExtractShaarliLinksFromMastodonArchive implements Callable<Integer>{ | |
private static final Logger logger = Logger.getLogger(ExtractShaarliLinksFromMastodonArchive.class.getName()); | |
public static void main(String... args) { | |
int exitCode = new CommandLine(new ExtractShaarliLinksFromMastodonArchive()).execute(args); | |
System.exit(exitCode); | |
} | |
@CommandLine.Option(names = {"-i", "--input"}, description = "Input json file.", defaultValue = "outbox.json") | |
private Path input; | |
@CommandLine.Option(names = {"-o", "--output"}, description = "Output html file.", defaultValue = "export.html") | |
private Path output; | |
@CommandLine.Option(names = {"-a", "--after"}, description = "Only include messages after this date.") | |
private Date after; | |
@CommandLine.Option(names = {"-b", "--before"}, description = "Only include messages before this date.") | |
private Date before; | |
@CommandLine.Option(names = {"-s", "--shaarli-server"}, description = "Shaarli server url", defaultValue="nicolas-delsaux.hd.free.fr/Shaarli") | |
private String shaarliServer; | |
@CommandLine.Option(names = {"-t", "--default-tags"}, description = "Default tags for created links", defaultValue="@tool:mastodon-to-shaarl") | |
private List<String> defaultTags; | |
static record ShaarliLink(String description, String url, List<String> tags, Date published) { | |
public String toFirefoxBookmark() { | |
long publishedAt = published.getTime()/1000; | |
Document page; | |
String title; | |
try { | |
page = Jsoup.parse(new URI(url).toURL(), 10000); | |
title = page.title(); | |
} catch (IOException | URISyntaxException e) { | |
title = url; | |
} | |
String htmlDescription = String.format("\n%s<DD>%s</DD>", | |
StringUtils.repeat("\t", tags.size()), | |
description | |
); | |
String htmlLink = String.format("\n%s<A HREF=\"%s\" ADD_DATE=\"%d\" LAST_VISIT=\"%d\" LAST_MODIFIED=\"%d\">%s</A>", | |
StringUtils.repeat("\t", tags.size()), | |
url, | |
publishedAt, | |
publishedAt, | |
publishedAt, | |
title | |
); | |
List<String> beforeList = new ArrayList<String>(); | |
List<String> afterList = new ArrayList<String>(); | |
IntStream.range(0, tags.size()).asLongStream() | |
.forEach(index -> { | |
String tag = tags.get((int) index); | |
String prefix = StringUtils.repeat("\t", (int) index); | |
String before = String.format("%s<DT><H3 ADD_DATE=\"%d\">%s</H3></DT>\n%s<DL><p>", | |
prefix, publishedAt, tag, prefix); | |
String after = String.format("%s</DL><p>", prefix); | |
beforeList.add(before); | |
afterList.addFirst(after); | |
}) | |
; | |
return beforeList.stream().collect(Collectors.joining("\n")) | |
+ htmlLink | |
+ htmlDescription | |
+ "\n" | |
+ afterList.stream().collect(Collectors.joining("\n")); | |
} | |
} | |
@JsonIgnoreProperties(ignoreUnknown = true) | |
static record Tag ( | |
@JsonProperty String type, | |
@JsonProperty String href, | |
@JsonProperty String name | |
) { | |
} | |
@JsonIgnoreProperties(ignoreUnknown = true) | |
static record Content ( | |
@JsonProperty String id, | |
@JsonProperty String type, | |
@JsonProperty String summary, | |
@JsonProperty String inReplyTo, | |
@JsonProperty Date published, | |
@JsonProperty String url, | |
@JsonProperty String attributedTo, | |
@JsonProperty List<String> to, | |
@JsonProperty List<String> cc, | |
@JsonProperty boolean sensitive, | |
@JsonProperty String atomUri, | |
@JsonProperty String inReplyToAtomUri, | |
@JsonProperty String conversation, | |
@JsonProperty String content, | |
@JsonProperty List<Tag> tag | |
) { | |
Optional<ShaarliLink> toShaarliLink(List<String> defaultTags, CharSequence shaarliServer) { | |
try { | |
long tagsCount = tag.stream() | |
.filter(t -> "Hashtag".equals(t.type)) | |
.count(); | |
// I always put tags in Shaarli, always, so I can remove non-tagged messages | |
// And since there can be mentions and other tags types | |
// Let's just count number of tag in "tag" collection | |
if(tagsCount<tag.size()) | |
return Optional.empty(); | |
// Now this will be complicated : we will get back the *real* tag text | |
// (because the tag collection strips accents) | |
// So load content in jsoup | |
Document doc = Jsoup.parseBodyFragment(StringEscapeUtils.unescapeHtml4(content())); | |
Element body = doc.body(); | |
Node paragraph = body.lastChild(); | |
if(!paragraph.nameIs("p")) | |
return Optional.empty(); | |
List<Tag> remainingTags = new ArrayList<Tag>(tag()); | |
List<String> usedTags = new ArrayList<String>(); | |
while(!remainingTags.isEmpty()) { | |
Tag lastTag = remainingTags.removeLast(); | |
// That tag url should be th last element of the last content paragraph | |
Node lastLink = paragraph.lastChild(); | |
if(!lastLink.nameIs("a")) | |
return Optional.empty(); | |
// It's a link, which href should be our last tag link | |
String href = StringUtils.stripAccents(URLDecoder.decode(lastLink.attr("href"), "UTF-8")); | |
if(!lastTag.href.equals(href)) | |
return Optional.empty(); | |
Node linkText = lastLink.lastChild(); | |
if(!linkText.nameIs("span")) | |
return Optional.empty(); | |
String tagText = ((TextNode) linkText.firstChild()).text(); | |
// Of course we insert element first | |
usedTags.add(0, tagText); | |
lastLink.remove(); | |
removeAllUselessTexts(paragraph); | |
} | |
// Now the last element should be a link | |
Node lastLink = paragraph.lastChild(); | |
if(!lastLink.nameIs("a")) | |
return Optional.empty(); | |
String usedUrl = lastLink.attr("href"); | |
lastLink.remove(); | |
removeAllUselessTexts(paragraph); | |
if(usedUrl.contains(shaarliServer)) { | |
return Optional.empty(); | |
} | |
String usedTitle = ""; | |
if (lastLink instanceof Element) { | |
Element lastLinkElement = (Element) lastLink; | |
usedTitle = lastLinkElement.text(); | |
} | |
// Now convert the whole remaining text back to text, and assume it's ok | |
String usedDescription = body.text(); | |
usedTags.addAll(defaultTags); | |
return Optional.of(new ShaarliLink(usedDescription, usedUrl, usedTags, published)); | |
} catch(UnsupportedEncodingException e) { | |
throw new RuntimeException(e); | |
} | |
} | |
void removeAllUselessTexts(Node paragraph) { | |
while(paragraph.lastChild() instanceof TextNode) { | |
String text = ((TextNode) paragraph.lastChild()).text(); | |
if(text.isBlank() || text.trim().equals("-")) { | |
paragraph.lastChild().remove(); | |
} else { | |
return; | |
} | |
} | |
} | |
} | |
@JsonTypeInfo(use = Id.NAME, include = As.PROPERTY, property="type") | |
@JsonSubTypes({ @Type(value=Create.class, name="Create"), @Type(value=Announce.class, name="Announce")}) | |
static interface Message { | |
} | |
@JsonIgnoreProperties(ignoreUnknown = true) | |
static record Announce () implements Message {} | |
@JsonIgnoreProperties(ignoreUnknown = true) | |
static record Create ( | |
@JsonProperty String id, | |
@JsonProperty String type, | |
@JsonProperty String actor, | |
@JsonProperty Date published, | |
@JsonProperty List<String> to, | |
@JsonProperty List<String> cc, | |
@JsonProperty Content object | |
) implements Message {} | |
@JsonIgnoreProperties(ignoreUnknown = true) | |
static record Outbox( | |
@JsonProperty("orderedItems") List<Message> messages) { | |
Collection<Content> findContent(Date publishedAfter, Date publishedBefore) { | |
return messages.stream() | |
.filter(m -> m instanceof Create) | |
.map(c -> (Create)c) | |
.filter(c -> c.published.compareTo(publishedAfter)>=0) | |
.filter(c -> c.published.compareTo(publishedBefore)<=0) | |
.map(c -> c.object) | |
.toList() | |
; | |
} | |
} | |
@Override | |
public Integer call() throws Exception { | |
ExtractShaarliLinksFromMastodonArchive.Outbox outbox = readJson(input); | |
Collection<Content> filteredContent = outbox.findContent( | |
after==null ? new Date(0) : after, | |
before==null ? new Date() : before | |
); | |
Collection<ExtractShaarliLinksFromMastodonArchive.ShaarliLink> links = toShaarliLinks(filteredContent); | |
FileUtils.write(output.toFile(), generateBookmarksFile(links), Charset.forName("UTF-8")); | |
logger.info(String.format("✅ Written bookmarks to %s", output)); | |
return 0; | |
} | |
private String generateBookmarksFile(Collection<ExtractShaarliLinksFromMastodonArchive.ShaarliLink> links) { | |
return links.stream() | |
.map(ShaarliLink::toFirefoxBookmark) | |
.collect(Collectors.joining("\n", | |
""" | |
<!DOCTYPE NETSCAPE-Bookmark-file-1> | |
<!--This is an automatically generated file. | |
It will be read and overwritten. | |
Do Not Edit! --> | |
<Title>Bookmarks</Title> | |
<H1>Bookmarks</H1> | |
<DL> | |
""", "</DL>")); | |
} | |
private Collection<ExtractShaarliLinksFromMastodonArchive.ShaarliLink> toShaarliLinks( | |
Collection<ExtractShaarliLinksFromMastodonArchive.Content> filteredContent) { | |
return filteredContent.stream() | |
.parallel() | |
.flatMap(c -> c.toShaarliLink(defaultTags, shaarliServer).stream()) | |
.toList(); | |
} | |
private ExtractShaarliLinksFromMastodonArchive.Outbox readJson(Path input) throws StreamReadException, DatabindException, IOException { | |
try { | |
logger.info(String.format("⌛️ Reading outbox from %s", input)); | |
ObjectMapper mapper = JsonMapper.builder() | |
.enable(StreamReadFeature.INCLUDE_SOURCE_IN_LOCATION) | |
.build(); | |
return mapper.readValue(input.toFile(), Outbox.class); | |
} finally { | |
logger.info(String.format("✅ Read outbox from %s", input)); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment