Last active
October 14, 2025 16:17
-
-
Save ythosa/3647493c08ac05124a5647d3535da613 to your computer and use it in GitHub Desktop.
Effective way to find `hashtags` in raw text in Go
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// ParseHashtags scans input for hashtags and returns them, in order of
// appearance, joined by single spaces. It returns "" when none are found.
//
// A candidate is accepted when all of the following hold:
//   - no letter or digit occurs between the nearest preceding whitespace (or
//     the start of the text still being scanned) and the '#';
//   - the first character after '#' is a letter;
//   - the body is made of letters, digits, '_', '.' and '-', and is cut back
//     so that it ends on a letter or digit;
//   - the cut-back body is at least minHashtagLength Unicode code points long.
//
// A run of consecutive '#' characters is treated as a single '#'.
//
// Text is consumed as the scan advances, so the prefix rule only looks at
// text that has not been consumed yet: a '#' that directly follows an
// already processed candidate starts a new candidate.
func ParseHashtags(input string) string {
	// Minimum length of the hashtag body (the part after '#'), in code points.
	const minHashtagLength = 3

	var foundHashtags []string

searchHashtag:
	for {
		hashtagIndex := strings.Index(input, "#")
		if hashtagIndex < 0 {
			break // no more '#' characters
		}

		// When the '#' is not the first character, the word it is attached to
		// (everything since the last whitespace) must not contain a letter or
		// digit; otherwise the '#' sits inside or at the end of a word.
		if hashtagIndex > 0 {
			lastSpace := max(0, strings.LastIndexFunc(input[:hashtagIndex], unicode.IsSpace))
			for _, r := range input[lastSpace:hashtagIndex] {
				if unicode.IsLetter(r) || unicode.IsDigit(r) {
					input = input[hashtagIndex+1:] // skip this '#', keep scanning
					continue searchHashtag
				}
			}
		}

		// Squash a run of '#' by moving to the last '#' of the run, so that
		// input[0] is the '#' that introduces the body.
		start := hashtagIndex
		for start+1 < len(input) && input[start+1] == '#' {
			start++
		}
		input = input[start:]

		// Validate the body. bodyEnds is the byte length of the longest body
		// prefix that ends on a letter or digit.
		body := input[1:]
		bodyEnds := 0
		for i, r := range body {
			// The first character after '#' must be a letter.
			if i == 0 && !unicode.IsLetter(r) {
				input = body
				continue searchHashtag
			}
			if unicode.IsLetter(r) || unicode.IsDigit(r) {
				bodyEnds = i + utf8.RuneLen(r)
			} else if r != '_' && r != '.' && r != '-' {
				break
			}
		}

		// Compare the length in code points, not bytes, so that multi-byte
		// scripts are held to the same minimum as ASCII.
		if utf8.RuneCountInString(body[:bodyEnds]) >= minHashtagLength {
			foundHashtags = append(foundHashtags, input[:bodyEnds+1])
		}
		input = input[bodyEnds+1:]
	}

	return strings.Join(foundHashtags, " ")
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment