Created
July 7, 2022 21:05
-
-
Save scottcagno/6d55f877f4e46bdf1d545fb17bced091 to your computer and use it in GitHub Desktop.
File Utilities -- Scan, Index
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // IndexSpans can be used to index spans of text based around a delimiter of your | |
| // choice. The size argument allows you to tune it a bit and have some control over | |
| // the overhead used by the function. The delimiter is not included in the returned | |
| // span bound set and empty lines are not ignored. | |
| func IndexSpans(r io.Reader, delim byte, size int) ([]Span, error) { | |
| // Setup initial variables for the function | |
| var id, beg, end int | |
| // Drop is a helper func used to drop the correct number of bytes. Right now it | |
| // is mostly used to handle the special case of \n and \r\n | |
| drop := func(p []byte, c byte) int { | |
| if c == '\n' { | |
| if len(p) > 1 && p[len(p)-2] == '\r' { | |
| return 2 | |
| } | |
| } | |
| return 1 | |
| } | |
| // Initialize our spans | |
| spans := make([]Span, 0, 8) | |
| // Get a new buffered reader set to our determined buffer size. | |
| br := bufio.NewReaderSize(r, size) | |
| for { | |
| // Read up to buffer size length of data and look for the delimiter. If we fill | |
| // up the buffer and do not find the delimiter we are looking for, we will just | |
| // keep reading, one buffer length at a time, until we find it. Note: ReadSlice | |
| // attempts to re-use the same buffer internally, so that helps a lot. | |
| data, err := br.ReadSlice(delim) | |
| if err != nil { | |
| if err == io.EOF { | |
| // We have reached the end--we are going to check for any remaining data. | |
| if len(data) > 0 { | |
| // We have some leftover data, which means the stream was not delimiter | |
| // terminated. Add the remaining data to one last span before breaking. | |
| spans = append(spans, Span{id + 1, beg, end + len(data)}) | |
| } | |
| // Otherwise, the stream is indeed delimiter terminated, so we can just break. | |
| break | |
| } | |
| if err == bufio.ErrBufferFull { | |
| // Our buffer seems to be full, so at this point we will simply update the | |
| // ending offset and continue reading (skipping all the stuff below, and | |
| // restarting the loop from the next iteration.) | |
| end += len(data) | |
| continue | |
| } | |
| // Uh oh, we have some other issue going on. | |
| return nil, err | |
| } | |
| // We were able to locate a delimiter without filling the buffer, so we should update | |
| // our ending offset; then add our span data to our set. | |
| end += len(data) | |
| // Calculate number of bytes to drop | |
| n := drop(data, delim) | |
| // Add our span to our set and adjust the beginning, ending and id variables | |
| spans = append(spans, Span{id, beg, end - n}) | |
| // We will grow the beginning offset up to where the end is, and increment the id. | |
| beg = end | |
| id++ | |
| } | |
| return spans, nil | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment