Skip to content

Instantly share code, notes, and snippets.

@etissieres
Last active August 18, 2023 09:17
Show Gist options
  • Save etissieres/3f71ab17043409e9ab14b088ea2d7b01 to your computer and use it in GitHub Desktop.
Save etissieres/3f71ab17043409e9ab14b088ea2d7b01 to your computer and use it in GitHub Desktop.
Browser HTML bookmarks file parsing
package webbr
import (
"context"
"errors"
"io"
"net/url"
"regexp"
"slices"
"golang.org/x/net/html"
)
// Bookmark is a single link extracted from a bookmarks HTML file.
type Bookmark struct {
// Name is the link's text with every run of whitespace collapsed to one space.
Name string
// Url is the parsed value of the link's href attribute.
Url *url.URL
// Folders lists the names of the folders enclosing the link, outermost
// first. Folder names listed in foldersToIgnore are not included.
Folders []string
}
var (
// spaces matches any run of whitespace; it is used to collapse such runs
// into a single space in bookmark names.
spaces = regexp.MustCompile(`\s+`)
// foldersToIgnore holds folder names that are never recorded in
// Bookmark.Folders. They look like the French and English labels of a
// browser's built-in top-level folders (NOTE(review): confirm which
// browsers/locales these cover). Matching is exact and case-sensitive.
foldersToIgnore = []string{
"Barre de favoris",
"Autres favoris",
"Favoris sur mobile",
"Bookmarks bar",
"Other bookmarks",
"Mobile bookmarks",
}
)
// ParseBookmarks parses the bookmarks HTML document read from r on a
// background goroutine and streams every bookmark it finds over the returned
// channel.
//
// The channel is unbuffered, so the parser advances only as fast as the
// caller receives. It is closed once parsing stops: when the tokenizer
// reports an error token (which includes end of input) or when ctx is
// cancelled while a bookmark is waiting to be delivered. Callers that stop
// receiving early must cancel ctx to release the goroutine.
func ParseBookmarks(ctx context.Context, r io.Reader) <-chan *Bookmark {
	out := make(chan *Bookmark)
	go func() {
		// Closing on the way out tells receivers no more bookmarks will come.
		defer close(out)
		parseBookmarks(ctx, r, out)
	}()
	return out
}
// parseBookmarks tokenizes the bookmarks HTML document read from r and sends
// one *Bookmark on ch for every <a> start tag that carries a parseable href
// and is followed by a text token (the bookmark's name).
//
// Folder tracking: the text of an <h3> element names a folder and the <dl>
// opened after it holds that folder's content. Folder names are kept on a
// stack that is copied into each bookmark. Names listed in foldersToIgnore
// are not pushed. Every open <dl> remembers whether it belongs to a pushed
// folder, so the closing </dl> of an ignored (or anonymous) list never pops
// an unrelated enclosing folder.
//
// The function returns when the tokenizer yields an ErrorToken (end of input
// or a read error; the error itself is not reported) or when ctx is
// cancelled while a bookmark is about to be delivered. It never closes ch;
// that is the caller's job.
func parseBookmarks(ctx context.Context, r io.Reader, ch chan *Bookmark) {
	var (
		// isLink: the next text token is the name of the pending bookmark.
		isLink bool
		// isFolder: the next text token is a folder name.
		isFolder bool
		// folderPending: the most recent <h3> pushed a name onto folders and
		// the <dl> that owns it has not been opened yet.
		folderPending bool
		bookmark      = &Bookmark{}
		// folders is the stack of enclosing folder names, outermost first.
		folders []string
		// dlOwnsFolder has one entry per currently open <dl>; an entry is
		// true when closing that <dl> must pop one name from folders.
		dlOwnsFolder []bool
	)
	tzr := html.NewTokenizer(r)
	for {
		switch tzr.Next() {
		case html.ErrorToken:
			return
		case html.StartTagToken:
			t := tzr.Token()
			// Any start tag other than a usable <a> ends "waiting for a link
			// name" mode, exactly as before.
			isLink = false
			switch t.Data {
			case "a":
				href, err := parseHref(t)
				if err == nil {
					isLink = true
					bookmark.Url = href
					// Copy the stack: it keeps changing while the bookmark
					// is owned by the receiver.
					bookmark.Folders = make([]string, len(folders))
					copy(bookmark.Folders, folders)
				}
			case "h3":
				isFolder = true
			case "dl":
				dlOwnsFolder = append(dlOwnsFolder, folderPending)
				folderPending = false
			}
		case html.EndTagToken:
			t := tzr.Token()
			if t.Data == "dl" && len(dlOwnsFolder) > 0 {
				last := len(dlOwnsFolder) - 1
				if dlOwnsFolder[last] && len(folders) > 0 {
					folders = folders[:len(folders)-1]
				}
				dlOwnsFolder = dlOwnsFolder[:last]
			}
		case html.TextToken:
			t := tzr.Token()
			switch {
			case isLink:
				bookmark.Name = spaces.ReplaceAllString(t.Data, " ")
				isLink, isFolder = false, false
				// Give cancellation priority: when ctx is already done and a
				// receiver is waiting, a bare select would pick one of the
				// two ready cases at random and could still deliver.
				if ctx.Err() != nil {
					return
				}
				select {
				case <-ctx.Done():
					return
				case ch <- bookmark:
				}
				bookmark = &Bookmark{}
			case isFolder:
				isFolder = false
				folderPending = !slices.Contains(foldersToIgnore, t.Data)
				if folderPending {
					folders = append(folders, t.Data)
				}
			}
		}
	}
}
// hrefNotFound is returned by parseHref when a tag has no href attribute.
var hrefNotFound = errors.New("href not found")
// parseHref returns the parsed URL held by the first href attribute of t.
// It returns hrefNotFound when t has no href attribute, and the error from
// url.Parse when the attribute's value is not a valid URL.
func parseHref(t html.Token) (*url.URL, error) {
	for i := range t.Attr {
		if t.Attr[i].Key != "href" {
			continue
		}
		return url.Parse(t.Attr[i].Val)
	}
	return nil, hrefNotFound
}
package webbr
import (
"context"
"fmt"
"io"
"net/url"
"os"
"path/filepath"
"testing"
)
// TestParseBookmarks parses the sample bookmarks file and checks that the
// bookmarks arrive in document order with the expected name, URL and folder
// path, and that the channel is closed afterwards.
func TestParseBookmarks(t *testing.T) {
	r := mustOpenSample()
	defer r.Close()
	expectedBookmarks := []Bookmark{
		{
			Name:    "Google",
			Url:     mustParseUrl("https://www.google.fr/"),
			Folders: []string{"Moteurs de recherche"},
		},
		{
			Name:    "DuckDuckGo",
			Url:     mustParseUrl("https://duckduckgo.com/"),
			Folders: []string{"Moteurs de recherche"},
		},
		{
			Name:    "Bing",
			Url:     mustParseUrl("https://www.bing.com/"),
			Folders: []string{"Moteurs de recherche"},
		},
		{
			Name:    "PREVISIONS METEO FRANCE",
			Url:     mustParseUrl("https://meteofrance.com/"),
			Folders: nil,
		},
		{
			Name:    "LinuxFr.org",
			Url:     mustParseUrl("https://linuxfr.org/"),
			Folders: []string{"Technologies"},
		},
		{
			Name:    "Developpez.com, le club des développeurs et IT Pro",
			Url:     mustParseUrl("https://www.developpez.com/"),
			Folders: []string{"Technologies"},
		},
		{
			Name:    "Korben",
			Url:     mustParseUrl("https://korben.info/"),
			Folders: []string{"Technologies", "Blogs"},
		},
	}
	// Cancel on exit so the parser goroutine is released even when the test
	// stops receiving while a bookmark is still waiting to be delivered.
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	ch := ParseBookmarks(ctx, r)
	for i, eb := range expectedBookmarks {
		b, ok := <-ch
		if !ok {
			// A closed channel yields a nil *Bookmark; stop before
			// dereferencing it.
			t.Fatalf("channel closed after %d bookmarks, expected %d", i, len(expectedBookmarks))
		}
		if b.Name != eb.Name {
			t.Errorf("expected name [%s], got [%s]", eb.Name, b.Name)
		}
		if b.Url.String() != eb.Url.String() {
			t.Errorf("expected url [%s], got [%s]", eb.Url, b.Url)
		}
		// Compare lengths first: it catches missing or extra folders and
		// keeps the index below in range. A nil and an empty slice both
		// count as "no folders".
		if len(b.Folders) != len(eb.Folders) {
			t.Errorf("expected folders %q, got %q", eb.Folders, b.Folders)
			continue
		}
		for j, ef := range eb.Folders {
			if f := b.Folders[j]; f != ef {
				t.Errorf("expected folders[%d] [%s], got [%s]", j, ef, f)
			}
		}
	}
	if _, ok := <-ch; ok {
		t.Errorf("expected channel to be closed")
	}
}
// mustParseUrl parses rawUrl and returns the resulting URL. It panics with a
// descriptive message when rawUrl cannot be parsed, which makes it suitable
// for building test fixtures inline.
func mustParseUrl(rawUrl string) *url.URL {
	parsed, err := url.Parse(rawUrl)
	if err == nil {
		return parsed
	}
	panic(fmt.Sprintf("failed to parse URL %s: %s", rawUrl, err))
}
// TestParseBookmarksQuit checks that cancelling the context after the first
// bookmark has been received makes the parser stop and close its channel
// instead of delivering the remaining bookmarks.
func TestParseBookmarksQuit(t *testing.T) {
	sample := mustOpenSample()
	defer sample.Close()

	ctx, stop := context.WithCancel(context.Background())
	bookmarks := ParseBookmarks(ctx, sample)

	// Take one bookmark, then ask the parser to stop.
	<-bookmarks
	stop()

	// NOTE(review): this assertion relies on the parser noticing the
	// cancellation before it offers the next bookmark. If the parser waits on
	// the send and on ctx.Done() in a single select, both can be ready here
	// and Go chooses between them at random — confirm that the parser gives
	// cancellation priority.
	if _, open := <-bookmarks; open {
		t.Errorf("expected channel to be closed")
	}
}
// BenchmarkParseBookmarks measures a full parse of the sample bookmarks file.
//
// The sample is rewound before every iteration; without that, only the first
// iteration would see any data and all later ones would parse an exhausted
// reader. Opening the file is kept out of the timed region.
func BenchmarkParseBookmarks(b *testing.B) {
	r := mustOpenSample()
	defer r.Close()
	rs, ok := r.(io.ReadSeeker)
	if !ok {
		b.Fatalf("sample reader %T cannot be rewound", r)
	}
	ctx := context.Background()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		if _, err := rs.Seek(0, io.SeekStart); err != nil {
			b.Fatalf("rewinding bookmarks sample file: %s", err)
		}
		// Drain the channel so the parser runs to the end of the input.
		for range ParseBookmarks(ctx, rs) {
		}
	}
}
// mustOpenSample opens testdata/bookmarks_sample.html, resolved against the
// current working directory, and returns it for reading. It panics when the
// file cannot be opened. The caller is responsible for closing the result.
func mustOpenSample() io.ReadCloser {
	samplePath := filepath.Join("testdata", "bookmarks_sample.html")
	f, err := os.Open(samplePath)
	if err == nil {
		return f
	}
	panic(fmt.Sprintf("failed to open bookmarks sample file: %s", err))
}
<!DOCTYPE NETSCAPE-Bookmark-file-1>
<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
<TITLE>Bookmarks</TITLE>
<H1>Bookmarks</H1>
<DL><p>
<DT><H3 ADD_DATE="1658779544" LAST_MODIFIED="1658779692" PERSONAL_TOOLBAR_FOLDER="true">Barre de favoris</H3>
<DL><p>
<DT><H3 ADD_DATE="1658779589" LAST_MODIFIED="1658779660">Moteurs de recherche</H3>
<DL><p>
<DT><A HREF="https://www.google.fr/" ADD_DATE="1658779603" ICON="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAACIklEQVQ4jYWSS0iUURTHf/fe8RvHooE2VlT2FNqUGWmNEYUR9lhEEVJhUIsoXOQuap1Rq6KHNQt3LaPAIOxhlNTChUwLMU3NR1CklUzg6xvPd1ro2KhTHjjcA/e8/uf/hzmmqsUiEheRLhHxp/2TiDxQ1aK5+ZmFeSJSrwuYiMRVNZKuMxnFz51zu9T3GX/6iPGmRqS/F5WAUMEawuUVRI5UYjwPEWl2zlUYY8YMgIjUW2vPBkPfSV6uYbKvJ+uW3rZSojfuABAEQdw5d96oajHQqr7P8IUqpL8X43lEjp3EK4mBtfgt75l4+4po7U3cytWZPbcyjUlTidv642ipDu7foX7bh2zgs92jDhHpUlWdbNmuEw15OvqweqE7ZjboCAEFADrSjs1LkRM7NAt3+bWRebfYudFx9XguwFqbwePs9z/mT/6NLdAHMBpex28W0/C1Y1Zy05VFM75nUwiAZVGT/v5sgdcA3UurOPUrxvXOFhJD7fOmdn4LeNc5NbpkfWimv5mWZ8KXFKdfXqInOYBnc6gsPEjZ8mKssbQOtvEkMczYl0oK8z3un4lgppbYkhZS3Fp7bnD0Jxeba+lODmTFviFcxq29NeRHDUEQ1DnnqtNSjohIo3Nutx+keNz9gmf9zfQkB0ChYMkK9q2KcaLwMJFQGFV9Y4w5YIwZzyBBI2lRLcD9PVXN/SdFqlokInUi0iEiE9P+UUTuqurmufl/AKTzsFGmvUNUAAAAAElFTkSuQmCC">Google</A>
<DT><A HREF="https://duckduckgo.com/" ADD_DATE="1658779616" ICON="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAC7UlEQVQ4jZWTX2hbdRTHP79fb+7Nwm2ytjMhic1Msa0ymViY+KArTFeU0BehG5Xrn0Ee5sOk9MHJHqQMwakgrg+KUFyVsOL2MLBkagtCLGNiuwwWN6fd2jWuXbOs02Q3yW6W3OtDMStjL56nc+B8v5zv4fsVPFBLgzu22ThxcPqEokUAnJqVBTElEWNbJ2YvbtwX/zXOyIC6+Ofipy7/o297Y4b09LyA1H0A2GaBcnqGYjJh37t5/YtoV3RYjJysNgickQH12h8L3+u79+xqfe0dCskEZmqS2s1lABR/GL23H1/M4PbxUczpEz891t3xihg5WW0CeDPkOdq8e89e38t7yX14AGs+g3/4EzZtf47KxVnqf+e5e2mOSnqGNmMIxyGamz3TejSzclosDe7YJgPhC6EjE/LG+/sQqkbg4ChNunf9/KpFfvQQ5XMpANRIJ8HDx1h5b9C2c8vbpY0T98YMWUgmqGbn2TywvwEGkKpG2753oUkBoJqdp5BM4I0Z0saJS3D6PD07MVOT63pbtgDw1sFvGiTxj39Ee/ypxmymJvH07AScPikULSJ1b+NhtVurAIx/9EYDMP7Bi7jbb9HyUo6AsYRTWkLqXoSiReSDPrAWL28YLlPPdHLv/PNsCp3BvqOQPxWmXlLuS3RqVtY2iyj+8Drm6kafCJzyGmvfadyeDlAzXaiPWCj+MLZZwKlZWQliqpz+Gb23f/1JVy/dx2vdLI8/Aw64I2VsS2Bd96D39lNOzwBiSkrEWDGZsH0xAzXSSb2wRm1ttcGhhjqpLOgUfm2lcqUZV6gLX8ygmEzYEjHW9NlvK/kDHbrfLpnPthlD3P39PK5AO6etFH+ZC6w21zinZcl0uXm6vhX/0BH+OfUVlcwvn0cn5r5WAKJd0eFr0yeeBHYFDx/Dmr9ArrxMOv8tLqHS0q6z2b2F4OtfNqwc7e4YhrmHh8n3alzeeCJIqx6kzR3ALpconv2BUvL4w8O0sf5vnP8FS/5CE3dr7UgAAAAASUVORK5CYII=">DuckDuckGo</A>
<DT><A HREF="https://www.bing.com/" ADD_DATE="1658779638" ICON="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAABrklEQVQ4jc2Sv0tbURTHv+fea9575of1L3DoJGLb2ckfi/EHuGQUArUZCi/QTu0grbTibKBLClooOOhQcIiDJNEO4tAllAwOBqSLOoh5fSaa5N3TIUlNQkg7dOh3O5fz/dzvORzg3+utAJjua6b629/p3hjdNGGnjHZQL0ND/tj+qOxX70jr+XqH+lIruys3ydnvPQBMALHv+eGwZchj6JrBrJMA1wTJZ0xKl6uVscqHyXyztwlom82U1TUSsoJyacRJTMadxNRL/uk+JiHIlLzWLXQDQIzYtz6SvrC+cz8XkzOniOR9CKeM4sZcQVduPhGpaYRTBgDuAgDw41KAmQAoAIB7RggM1ZsZHgCBgNtmbs1DADgUT+8KZY3z9dWT4sZcAQAG7PRD8vV/1dVSxklMLXbuQDWWCIBw69FrS/ARBQO5UDz9UbDoY/BTMgcsuiudAwAiOwI78DoT/JZ/ae+R8ltvwN4CIASDt6D5QvofvPBunWVnfeJ9p6dFLcdipwxEs2azDMUPlgdf5ThoZ1a7fdyiLqcc2ZYAELQzqyE7c4LxrOoB+EO6Buz/0S85h5rvF5Z4ogAAAABJRU5ErkJggg==">Bing</A>
</DL><p>
<DT><A HREF="https://meteofrance.com/" ADD_DATE="1658779660" ICON="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAACqElEQVQ4jUWTP2idZRjFf8/zvvf9cmNMSq4OLdEpSihUhKggFMygXUpxCogZumhFtEMm6SBcJ+kgHTJoEV10qGRUKXSQuiit3KFD2yEKhV6iNN4018pNvu/98zjc3nimMzwcDs85R5jg+Go45O3Xjf0/BW6Pee+9yOL5ivCXHd7c2WwABIC1Ly6gukaOGVTADCziWk+S4veITqNuhRJrQHHBYfYN3567KCyer3hl6Xd8tUBqAAMfwLUgZ0j1RUTX8VXAEqQGfAWxuc9vd5/zLHQEsxG5KeQU8aEixds0zVdgv5A14ZNSx2VQh9PXSAcFZMRCR5T+wBBRSlF8qLDyHaP6JFc+uMSVD29Q+XWqJ95C5Rpbt96g2HVcJQjQH9jEQcFXkGOPh3fPcnWjZqXrAU+Oy/jqGdAXeP7EOiU1EMa/W+iIAiBiOA+pbHB1o2b5XIvr3cRsZx7LxxgNDXVvo+4C6q6BZQylPW9jATNPPYKYewCcuZwBqOwIogFFiAcJkxkwI9V7iFUAEwcCBqoJgO4n46yTKfY4ahFBxFEsYKVM6qC05w2zRKsNjiUAlrfd2Fn5F0oCMUCw8g/e7eGn5ig0Y4H9XUHExvnr+wD0jmVWVx33hw9AHyAitOcU+JJsL+FDQCSxvyvKvXsAjngA4k6xdvlT6BY2NzO97gjX2qKaecTB6DPgIc6dpRkZiKM/MGGlO8XRp24hskjOCR8CJf+IyecM924yd2QJ718mxyVE36E0GfUKbLG98+LjIjGNC4q6gGUI7dOE1g90On8gnMTsI6Zm3oUiiPO4oBjT9Afm2L2ZOfHmLJRnMdtBZJcSd8jxb5wHK9tAH8uzlLwDDBAZgvuanz/+SQ7n+ep6m2ru/7lO8PTxwuadxBmmeMQ4vnoo/HppH+A/d4cqqIcGJosAAAAASUVORK5CYII=">PREVISIONS METEO FRANCE</A>
<DT><H3 ADD_DATE="1658779682" LAST_MODIFIED="1658779760">Technologies</H3>
<DL><p>
<DT><A HREF="https://linuxfr.org/" ADD_DATE="1658779692" ICON="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAIAAACQkWg2AAACs0lEQVQ4jQXB224TVxQA0H3O7Ll4bnZiJ46dOE4c10BUUslpEFXFC7zwIUj8BS98D28IhKCildJWqgJBSgShjWtDMMQ4xDO+zZw5N9Yi8UtHihDpxIKEEEoMUFJJjmiQNFWMS6oBAKXC0SUcdxABBBCFiEoHWlGTCp0lcaRfvTWPO24Ua6VkkHeqpfRKjW1vpmT60tLSlioD5UueokFPu/rZn7R/sea5DTvvaaJ6nY4BsBAm1aVzdEyuqcxSGcfsMiJng9yTv+3xvNFu/VhpVCutDUQLc/Y8mrz7580o+oKzWXD5TR++hcFoKRNWHE0LqNcKs2jc3Wn8XK5v+IvlymYTslmlXvnryWPM5tnBkfN51MrZ4fzzR1fJrYpsNO2vTHdf78efRplUxZWFH3a31za3dm/dwZSTzpl93BtNBu9yINobjkOFijvd98HgE1ttBYu53PDw5PmLP1p3f21dv46Sq8BLQnvc3DW1tIcJfdEzsOdE5+TGSnFv70ZhdVUI9uH90e+PfpsOu8bDe6ReUTd36E5Lb9flVo37Po1TPRaEJSoEXF5bITlzqbZesrz950+NB/fDwJtZrul6RpC3Fop8tZStL3EFshtn3qJbLYSOWxCaB/lcv3eChp6iBZYWUnPOgQoDQSyG2G5KzuYfWPcsq7mR75XX2Xwcz4ZUKz1nvlC24JZmQDV13MAN7eUSNKtET877p/8rIinhpycH/UkXpxcWIzYQagAzqMdUooUFihAQXCOb8GHSgz0t0mn/4r9m+yekJjgyRgoKKAFiECBkJhQdxvTjV1Ku1dtXd/2CFX87JwWzWMwjGKZDIZMIFA1ImfQB0ukUXp2osHj19i93SlaZSjAt08my49dHFEEpDVprKUWmUiXGyUQd/ksuRwvX3Ct+GkguOZXBcrBSrKSDyXfWCXZTpcfirwAAAABJRU5ErkJggg==">LinuxFr.org</A>
<DT><A HREF="https://www.developpez.com/" ADD_DATE="1658779745" ICON="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAABYUlEQVQ4jZWRTSiDARjHf+8+bLLFfI/GwgEnH+UrIaU2TSRc2E2EOdnJRw6ipBxWbg6sVtJEuSiHubgNNyVCws1nDg68j4scRHvffz2Xp+f////qgR/1GrHMFee3bVegQ8rvRVb5pi3X8dB59f5hDx5Nbc7ymgY8KvCsOVXALa6si5fSAvHTHJMOMgUMegLuBETG7D6s4/OYF2Kazd8B8j1egLSimX0Ibv11+x9WXEDExDCAzfw5kZ6ndsNqjVaCyo8SRG1CBJoBUl3LAsMhTQQKnBifGFN8tbDgPTiFureMbMCUoonghyQnqVOO9yTS0iqeuRuBkS5dAQBxMEc3DqWnPyxqvfVWbIwKOAWSBQwJf7s1u7uW7HQRjexUK/dKn1qACRgAShO2V3nWlxbDZwL+Rt3omCaj9rKVayjP1ekcLYHBSxgK6LKNh84tDvf0CIZAUGcjVA/FzQ3tkULdRuALJntu31c3rVAAAAAASUVORK5CYII=">Developpez.com, le club des développeurs et IT Pro</A>
<DT><H3 ADD_DATE="1658779714" LAST_MODIFIED="1658779745">Blogs</H3>
<DL><p>
<DT><A HREF="https://korben.info/" ADD_DATE="1658779721" ICON="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAABj0lEQVQ4jX3TPWgUURQF4G/XJQYhRo2FCGJAAhpQCy1SiFbaGLHQWrDQzkKEYCtibynaiJ1ExVRGkBDFStHCwsKfCBIwiL+gkGSTrMWccZ8pvHB5d+a+c96578xQRUM3juIBZvEjOZt3J4p9JQasw228wRgGi952XMQrTGB9ydJED6bxGSfRxnEM4xfu4yN6cRO7MZIeuI6HqfdFcqfI3zhbKBrHnfphf2T3R9FEQPNYSHaialf2rA3mWBPnI/FnRhnEUuqejDifw45kXcAtnGtib+Q3sBL2VkArqXuzbogamMTOFvrwLo2+nHoGT7EJBzCKKbzEFsxhpnbjPXaE9TAWo6qOgUI6XXs34lsL31V2zeC16sOZUtn1CaewGRcy+2TGHUrfDVwL6zDe+tfCMkcLJZdxD/aofO/HJV0LF4ts467KEbmrDzhUs13BczxTWbiscqBev2JrcfrjQvVf1vFCajvg5dRLkTuEJ3gUXNOqGMOX/9zBHK6WgMaquqOy9DQOYhvWBDit+ltflAR/ADKTeJvWOeGmAAAAAElFTkSuQmCC">Korben</A>
</DL><p>
</DL><p>
</DL><p>
</DL><p>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment