Skip to content

Instantly share code, notes, and snippets.

@etissieres
Last active August 18, 2023 09:17
Show Gist options
  • Save etissieres/3f71ab17043409e9ab14b088ea2d7b01 to your computer and use it in GitHub Desktop.
Save etissieres/3f71ab17043409e9ab14b088ea2d7b01 to your computer and use it in GitHub Desktop.
Browser HTML bookmarks file parsing
package webbr
import (
"context"
"errors"
"io"
"net/url"
"regexp"
"slices"
"golang.org/x/net/html"
)
// Bookmark is a single link extracted from a bookmarks file.
type Bookmark struct {
// Name is the link text with every run of whitespace collapsed to one space.
Name string
// Url is the parsed value of the link's href attribute.
Url *url.URL
// Folders holds the names of the folders that were open when the link was
// read, outermost first. Names listed in foldersToIgnore are not recorded.
Folders []string
}
var (
// spaces matches any run of whitespace; it is used to normalize link names.
spaces = regexp.MustCompile(`\s+`)
// foldersToIgnore lists folder names that are never recorded in
// Bookmark.Folders.
// NOTE(review): these look like the French and English names of a browser's
// built-in root folders — confirm against the browsers that must be supported.
foldersToIgnore = []string{
"Barre de favoris",
"Autres favoris",
"Favoris sur mobile",
"Bookmarks bar",
"Other bookmarks",
"Mobile bookmarks",
}
)
// ParseBookmarks reads bookmark HTML from r in a background goroutine and
// streams every bookmark it finds on the returned channel.
//
// The channel is unbuffered, so the parser advances only as fast as the
// caller receives. It is closed when parsing stops, either because the
// tokenizer reported an error (end of input included) or because ctx was
// cancelled.
func ParseBookmarks(ctx context.Context, r io.Reader) <-chan *Bookmark {
	out := make(chan *Bookmark)
	go func() {
		// Closing is deferred so receivers are always released once the
		// parser returns.
		defer close(out)
		parseBookmarks(ctx, r, out)
	}()
	return out
}
// parseBookmarks tokenizes the bookmark HTML read from r and sends one
// *Bookmark on ch for every <a> element whose href attribute parses as a URL.
//
// Folder tracking follows the nesting of <dl> lists: an <h3> announces the
// folder owning the next <dl>, and each </dl> closes exactly the list that the
// matching <dl> opened. Folders named in foldersToIgnore (and lists that have
// no <h3> at all, such as the document root) still occupy a slot on the stack,
// so closing them never removes an enclosing folder from later bookmarks.
//
// The name of a bookmark is its first text token with whitespace runs
// collapsed to a single space. A link that is closed before any text was seen
// is sent with an empty name instead of picking up the text that follows it.
//
// The function returns when the tokenizer reports an error (end of input
// included) or as soon as ctx is done. It does not close ch.
func parseBookmarks(ctx context.Context, r io.Reader, ch chan *Bookmark) {
	// level is one open <dl> list on the folder stack.
	type level struct {
		name string // folder name taken from the preceding <h3>
		keep bool   // whether name is reported in Bookmark.Folders
	}

	var (
		isLink   bool // inside an <a> that is still waiting for its name
		isFolder bool // inside an <h3> that is still waiting for its name
		bookmark = &Bookmark{}
		levels   []level
		pending  level // folder announced by the last <h3>, consumed by the next <dl>
	)

	// send delivers the current bookmark and prepares a fresh one. It reports
	// false when ctx is done, in which case the caller must stop parsing.
	send := func() bool {
		isLink, isFolder = false, false
		// select picks randomly among ready cases, so check cancellation
		// first to give it priority over a receiver that is already waiting.
		if ctx.Err() != nil {
			return false
		}
		select {
		case <-ctx.Done():
			return false
		case ch <- bookmark:
			bookmark = &Bookmark{}
			return true
		}
	}

	tzr := html.NewTokenizer(r)
	for {
		// Stop promptly even while scanning input that contains no links.
		if ctx.Err() != nil {
			return
		}

		switch tzr.Next() {
		case html.ErrorToken:
			return

		case html.StartTagToken:
			t := tzr.Token()
			// Any new tag ends the wait for a link name; "a" re-arms it below.
			isLink = false
			switch t.Data {
			case "a":
				href, err := parseHref(t)
				if err != nil {
					// Links without a usable href are skipped.
					continue
				}
				isLink = true
				bookmark.Url = href
				// Snapshot the folder path so later stack changes do not
				// alter bookmarks that were already sent.
				bookmark.Folders = make([]string, 0, len(levels))
				for _, lv := range levels {
					if lv.keep {
						bookmark.Folders = append(bookmark.Folders, lv.name)
					}
				}
			case "h3":
				isFolder = true
				// A folder whose <h3> has no text is kept with an empty name.
				pending = level{keep: true}
			case "dl":
				levels = append(levels, pending)
				pending = level{}
			}

		case html.EndTagToken:
			t := tzr.Token()
			switch t.Data {
			case "a":
				// The link had no text: send it with an empty name.
				if isLink && !send() {
					return
				}
			case "h3":
				isFolder = false
			case "dl":
				if len(levels) > 0 {
					levels = levels[:len(levels)-1]
				}
			}

		case html.TextToken:
			t := tzr.Token()
			switch {
			case isLink:
				bookmark.Name = spaces.ReplaceAllString(t.Data, " ")
				if !send() {
					return
				}
			case isFolder:
				pending.name = t.Data
				pending.keep = !slices.Contains(foldersToIgnore, t.Data)
				isFolder = false
			}
		}
	}
}
// hrefNotFound is returned by parseHref when a tag carries no href attribute.
var hrefNotFound = errors.New("href not found")
// parseHref returns the parsed value of the first href attribute of t.
// It returns hrefNotFound when t has no href attribute, or the error from
// url.Parse when the attribute value is not a valid URL.
func parseHref(t html.Token) (*url.URL, error) {
	for i := range t.Attr {
		if t.Attr[i].Key != "href" {
			continue
		}
		return url.Parse(t.Attr[i].Val)
	}
	return nil, hrefNotFound
}
package webbr
import (
"context"
"fmt"
"io"
"net/url"
"os"
"path/filepath"
"testing"
)
// TestParseBookmarks parses the sample file and checks that the bookmarks
// arrive in document order with the expected name, URL and folder path, and
// that the channel is closed after the last one.
func TestParseBookmarks(t *testing.T) {
	r := mustOpenSample()
	defer r.Close()

	expectedBookmarks := []Bookmark{
		{
			Name:    "Google",
			Url:     mustParseUrl("https://www.google.fr/"),
			Folders: []string{"Moteurs de recherche"},
		},
		{
			Name:    "DuckDuckGo",
			Url:     mustParseUrl("https://duckduckgo.com/"),
			Folders: []string{"Moteurs de recherche"},
		},
		{
			Name:    "Bing",
			Url:     mustParseUrl("https://www.bing.com/"),
			Folders: []string{"Moteurs de recherche"},
		},
		{
			Name:    "PREVISIONS METEO FRANCE",
			Url:     mustParseUrl("https://meteofrance.com/"),
			Folders: nil,
		},
		{
			Name:    "LinuxFr.org",
			Url:     mustParseUrl("https://linuxfr.org/"),
			Folders: []string{"Technologies"},
		},
		{
			Name:    "Developpez.com, le club des développeurs et IT Pro",
			Url:     mustParseUrl("https://www.developpez.com/"),
			Folders: []string{"Technologies"},
		},
		{
			Name:    "Korben",
			Url:     mustParseUrl("https://korben.info/"),
			Folders: []string{"Technologies", "Blogs"},
		},
	}

	// Cancelling on exit releases the parser goroutine if the test stops
	// receiving before the input is exhausted.
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	ch := ParseBookmarks(ctx, r)
	for i, eb := range expectedBookmarks {
		b, ok := <-ch
		if !ok {
			t.Fatalf("channel closed after %d bookmarks, expected %d", i, len(expectedBookmarks))
		}
		if b.Name != eb.Name {
			t.Errorf("expected name [%s], got [%s]", eb.Name, b.Name)
		}
		if b.Url.String() != eb.Url.String() {
			t.Errorf("expected url [%s], got [%s]", eb.Url, b.Url)
		}
		// Compare lengths first: it catches extra folders and keeps the
		// element-wise comparison below within bounds.
		if len(b.Folders) != len(eb.Folders) {
			t.Errorf("expected folders %v, got %v", eb.Folders, b.Folders)
			continue
		}
		for j, ef := range eb.Folders {
			if f := b.Folders[j]; f != ef {
				t.Errorf("expected folders[%d] [%s], got [%s]", j, ef, f)
			}
		}
	}

	if _, ok := <-ch; ok {
		t.Errorf("expected channel to be closed")
	}
}
// mustParseUrl parses rawUrl and returns the result. It panics when rawUrl is
// not a valid URL, which keeps test fixtures free of error handling.
func mustParseUrl(rawUrl string) *url.URL {
	parsed, err := url.Parse(rawUrl)
	if err == nil {
		return parsed
	}
	panic(fmt.Sprintf("failed to parse URL %s: %s", rawUrl, err))
}
// TestParseBookmarksQuit checks that cancelling the context after the first
// bookmark makes the parser stop and close its channel.
func TestParseBookmarksQuit(t *testing.T) {
	r := mustOpenSample()
	defer r.Close()

	ctx, cancel := context.WithCancel(context.Background())
	// Calling cancel more than once is harmless; deferring it guarantees the
	// context is released on every exit path.
	defer cancel()

	ch := ParseBookmarks(ctx, r)

	// Without this check the test would pass vacuously if the parser produced
	// nothing and closed the channel straight away.
	if _, ok := <-ch; !ok {
		t.Fatal("expected a bookmark before cancelling, got a closed channel")
	}

	cancel()

	if _, ok := <-ch; ok {
		t.Errorf("expected channel to be closed")
	}
}
// BenchmarkParseBookmarks measures a full parse of the sample file per
// iteration.
func BenchmarkParseBookmarks(b *testing.B) {
	r := mustOpenSample()
	defer r.Close()

	// The reader is shared by all iterations, so it has to be rewound before
	// each parse; otherwise every iteration after the first would start at the
	// end of the file and parse nothing.
	seeker, ok := r.(io.Seeker)
	if !ok {
		b.Fatal("sample reader does not support seeking")
	}

	ctx := context.Background()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		if _, err := seeker.Seek(0, io.SeekStart); err != nil {
			b.Fatalf("failed to rewind bookmarks sample file: %s", err)
		}
		for range ParseBookmarks(ctx, r) {
			// Drain the channel; only parsing time is of interest.
		}
	}
}
// mustOpenSample opens testdata/bookmarks_sample.html for reading and panics
// if the file cannot be opened. The caller is responsible for closing it.
func mustOpenSample() io.ReadCloser {
	samplePath := filepath.Join("testdata", "bookmarks_sample.html")
	f, err := os.Open(samplePath)
	if err == nil {
		return f
	}
	panic(fmt.Sprintf("failed to open bookmarks sample file: %s", err))
}
<!DOCTYPE NETSCAPE-Bookmark-file-1>
<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
<TITLE>Bookmarks</TITLE>
<H1>Bookmarks</H1>
<DL><p>
<DT><H3 ADD_DATE="1658779544" LAST_MODIFIED="1658779692" PERSONAL_TOOLBAR_FOLDER="true">Barre de favoris</H3>
<DL><p>
<DT><H3 ADD_DATE="1658779589" LAST_MODIFIED="1658779660">Moteurs de recherche</H3>
<DL><p>
<DT><A HREF="https://www.google.fr/" ADD_DATE="1658779603" ICON="">Google</A>
<DT><A HREF="https://duckduckgo.com/" ADD_DATE="1658779616" ICON="">DuckDuckGo</A>
<DT><A HREF="https://www.bing.com/" ADD_DATE="1658779638" ICON="">Bing</A>
</DL><p>
<DT><A HREF="https://meteofrance.com/" ADD_DATE="1658779660" ICON="">PREVISIONS METEO FRANCE</A>
<DT><H3 ADD_DATE="1658779682" LAST_MODIFIED="1658779760">Technologies</H3>
<DL><p>
<DT><A HREF="https://linuxfr.org/" ADD_DATE="1658779692" ICON="">LinuxFr.org</A>
<DT><A HREF="https://www.developpez.com/" ADD_DATE="1658779745" ICON="">Developpez.com, le club des développeurs et IT Pro</A>
<DT><H3 ADD_DATE="1658779714" LAST_MODIFIED="1658779745">Blogs</H3>
<DL><p>
<DT><A HREF="https://korben.info/" ADD_DATE="1658779721" ICON="">Korben</A>
</DL><p>
</DL><p>
</DL><p>
</DL><p>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment