Last active
August 18, 2023 09:17
-
-
Save etissieres/3f71ab17043409e9ab14b088ea2d7b01 to your computer and use it in GitHub Desktop.
Browser HTML bookmarks file parsing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package webbr | |
import ( | |
"context" | |
"errors" | |
"io" | |
"net/url" | |
"regexp" | |
"slices" | |
"golang.org/x/net/html" | |
) | |
// Bookmark is a single link extracted from a browser bookmarks export.
type Bookmark struct {
	// Name is the link text, with every run of whitespace collapsed into a
	// single space.
	Name string

	// Url is the parsed value of the link's href attribute.
	Url *url.URL

	// Folders lists the names of the folders enclosing the link, outermost
	// first. Folders listed in foldersToIgnore are left out.
	Folders []string
}
// spaces matches one or more consecutive whitespace characters; it is used to
// normalize bookmark titles.
var spaces = regexp.MustCompile(`\s+`)

// foldersToIgnore lists folder titles (French and English labels) that are
// not recorded in a bookmark's folder path.
var foldersToIgnore = []string{
	// French labels.
	"Barre de favoris",
	"Autres favoris",
	"Favoris sur mobile",
	// English labels.
	"Bookmarks bar",
	"Other bookmarks",
	"Mobile bookmarks",
}
func ParseBookmarks(ctx context.Context, r io.Reader) <-chan *Bookmark { | |
ch := make(chan *Bookmark) | |
go func() { | |
parseBookmarks(ctx, r, ch) | |
close(ch) | |
}() | |
return ch | |
} | |
func parseBookmarks(ctx context.Context, r io.Reader, ch chan *Bookmark) { | |
var ( | |
isLink bool | |
isFolder bool | |
bookmark *Bookmark = &Bookmark{} | |
folders []string | |
) | |
tzr := html.NewTokenizer(r) | |
for { | |
ttype := tzr.Next() | |
switch ttype { | |
case html.ErrorToken: | |
return | |
case html.StartTagToken: | |
t := tzr.Token() | |
isLink = t.Data == "a" | |
if isLink { | |
href, err := parseHref(t) | |
if err != nil { | |
isLink = false | |
} else { | |
bookmark.Url = href | |
bookmark.Folders = make([]string, len(folders)) | |
copy(bookmark.Folders, folders) | |
} | |
} else if t.Data == "h3" { | |
isFolder = true | |
} | |
case html.EndTagToken: | |
t := tzr.Token() | |
if len(folders) > 0 && t.Data == "dl" { | |
folders = folders[:len(folders)-1] | |
} | |
case html.TextToken: | |
t := tzr.Token() | |
if isLink { | |
bookmark.Name = spaces.ReplaceAllString(t.Data, " ") | |
isLink, isFolder = false, false | |
select { | |
case <-ctx.Done(): | |
return | |
case ch <- bookmark: | |
} | |
bookmark = &Bookmark{} | |
} else if isFolder { | |
if !slices.Contains(foldersToIgnore, t.Data) { | |
folders = append(folders, t.Data) | |
} | |
isFolder = false | |
} | |
} | |
} | |
} | |
// hrefNotFound is the error returned by parseHref when a tag carries no href
// attribute.
var hrefNotFound = errors.New("href not found")
func parseHref(t html.Token) (*url.URL, error) { | |
for _, attr := range t.Attr { | |
if attr.Key == "href" { | |
return url.Parse(attr.Val) | |
} | |
} | |
return nil, hrefNotFound | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package webbr | |
import ( | |
"context" | |
"fmt" | |
"io" | |
"net/url" | |
"os" | |
"path/filepath" | |
"testing" | |
) | |
func TestParseBookmarks(t *testing.T) { | |
r := mustOpenSample() | |
defer r.Close() | |
expectedBookmarks := []Bookmark{ | |
{ | |
Name: "Google", | |
Url: mustParseUrl("https://www.google.fr/"), | |
Folders: []string{"Moteurs de recherche"}, | |
}, | |
{ | |
Name: "DuckDuckGo", | |
Url: mustParseUrl("https://duckduckgo.com/"), | |
Folders: []string{"Moteurs de recherche"}, | |
}, | |
{ | |
Name: "Bing", | |
Url: mustParseUrl("https://www.bing.com/"), | |
Folders: []string{"Moteurs de recherche"}, | |
}, | |
{ | |
Name: "PREVISIONS METEO FRANCE", | |
Url: mustParseUrl("https://meteofrance.com/"), | |
Folders: nil, | |
}, | |
{ | |
Name: "LinuxFr.org", | |
Url: mustParseUrl("https://linuxfr.org/"), | |
Folders: []string{"Technologies"}, | |
}, | |
{ | |
Name: "Developpez.com, le club des développeurs et IT Pro", | |
Url: mustParseUrl("https://www.developpez.com/"), | |
Folders: []string{"Technologies"}, | |
}, | |
{ | |
Name: "Korben", | |
Url: mustParseUrl("https://korben.info/"), | |
Folders: []string{"Technologies", "Blogs"}, | |
}, | |
} | |
ctx := context.Background() | |
ch := ParseBookmarks(ctx, r) | |
for _, eb := range expectedBookmarks { | |
b := <-ch | |
if b.Name != eb.Name { | |
t.Errorf("expected name [%s], got [%s]", eb.Name, b.Name) | |
} | |
if b.Url.String() != eb.Url.String() { | |
t.Errorf("expected url [%s], got [%s]", eb.Url, b.Url) | |
} | |
for j, ef := range eb.Folders { | |
f := b.Folders[j] | |
if f != ef { | |
t.Errorf("expected folders[%d] [%s], got [%s]", j, ef, f) | |
} | |
} | |
} | |
_, ok := <-ch | |
if ok { | |
t.Errorf("expected channel to be closed") | |
} | |
} | |
// mustParseUrl parses rawUrl and panics when it is not a valid URL. It is a
// convenience for building test fixtures.
func mustParseUrl(rawUrl string) *url.URL {
	parsed, parseErr := url.Parse(rawUrl)
	if parseErr == nil {
		return parsed
	}
	panic(fmt.Sprintf("failed to parse URL %s: %s", rawUrl, parseErr))
}
func TestParseBookmarksQuit(t *testing.T) { | |
r := mustOpenSample() | |
defer r.Close() | |
ctx, cancel := context.WithCancel(context.Background()) | |
ch := ParseBookmarks(ctx, r) | |
<-ch | |
cancel() | |
_, ok := <-ch | |
if ok { | |
t.Errorf("expected channel to be closed") | |
} | |
} | |
func BenchmarkParseBookmarks(b *testing.B) { | |
r := mustOpenSample() | |
defer r.Close() | |
for i := 0; i < b.N; i++ { | |
ctx := context.Background() | |
ch := ParseBookmarks(ctx, r) | |
for _ = range ch { | |
} | |
} | |
} | |
// mustOpenSample opens testdata/bookmarks_sample.html and panics when the
// file cannot be opened. The caller is responsible for closing the result.
func mustOpenSample() io.ReadCloser {
	samplePath := filepath.Join("testdata", "bookmarks_sample.html")
	f, openErr := os.Open(samplePath)
	if openErr == nil {
		return f
	}
	panic(fmt.Sprintf("failed to open bookmarks sample file: %s", openErr))
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE NETSCAPE-Bookmark-file-1> | |
<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8"> | |
<TITLE>Bookmarks</TITLE> | |
<H1>Bookmarks</H1> | |
<DL><p> | |
<DT><H3 ADD_DATE="1658779544" LAST_MODIFIED="1658779692" PERSONAL_TOOLBAR_FOLDER="true">Barre de favoris</H3> | |
<DL><p> | |
<DT><H3 ADD_DATE="1658779589" LAST_MODIFIED="1658779660">Moteurs de recherche</H3> | |
<DL><p> | |
<DT><A HREF="https://www.google.fr/" ADD_DATE="1658779603" ICON="">Google</A> | |
<DT><A HREF="https://duckduckgo.com/" ADD_DATE="1658779616" ICON="">DuckDuckGo</A> | |
<DT><A HREF="https://www.bing.com/" ADD_DATE="1658779638" ICON="">Bing</A> | |
</DL><p> | |
<DT><A HREF="https://meteofrance.com/" ADD_DATE="1658779660" ICON="">PREVISIONS METEO FRANCE</A> | |
<DT><H3 ADD_DATE="1658779682" LAST_MODIFIED="1658779760">Technologies</H3> | |
<DL><p> | |
<DT><A HREF="https://linuxfr.org/" ADD_DATE="1658779692" ICON="">LinuxFr.org</A> | |
<DT><A HREF="https://www.developpez.com/" ADD_DATE="1658779745" ICON="">Developpez.com, le club des développeurs et IT Pro</A> | |
<DT><H3 ADD_DATE="1658779714" LAST_MODIFIED="1658779745">Blogs</H3> | |
<DL><p> | |
<DT><A HREF="https://korben.info/" ADD_DATE="1658779721" ICON="">Korben</A> | |
</DL><p> | |
</DL><p> | |
</DL><p> | |
</DL><p> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment