forked from sourcegraph/sourcegraph-public-snapshot
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathsplit.go
More file actions
88 lines (75 loc) · 2.28 KB
/
Copy pathsplit.go
File metadata and controls
88 lines (75 loc) · 2.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
package context
import (
"math"
"strings"
)
var splittableLinePrefixes = []string{
"//",
"#",
"/*",
"func",
"var",
"const",
"fn",
"public",
"private",
"type",
}
func isSplittableLine(line string) bool {
trimmedLine := strings.TrimSpace(line)
if len(trimmedLine) == 0 {
return true
}
for _, prefix := range splittableLinePrefixes {
if strings.HasPrefix(line, prefix) {
return true
}
}
return false
}
// SplitOptions holds the token thresholds that SplitIntoEmbeddableChunks uses
// to decide whether and where to split text into chunks.
type SplitOptions struct {
// NoSplitTokensThreshold: text whose estimated token count is below this
// value is returned as a single chunk without splitting.
NoSplitTokensThreshold int
// ChunkTokensThreshold: once the tokens accumulated in the current chunk
// exceed this value, the chunk is ended before the next line.
ChunkTokensThreshold int
// ChunkEarlySplitTokensThreshold: once the tokens accumulated in the current
// chunk exceed this value, the chunk is ended before the next splittable line.
ChunkEarlySplitTokensThreshold int
}
// EmbeddableChunk is a contiguous range of lines from a file together with the
// text of those lines. The field descriptions below reflect how
// SplitIntoEmbeddableChunks populates them.
type EmbeddableChunk struct {
// FileName is the file name supplied by the caller.
FileName string
// StartLine is the zero-based index of the first line in the chunk.
StartLine int
// EndLine is the zero-based index one past the last line in the chunk.
EndLine int
// Content is the text of the chunk: its lines joined with "\n".
Content string
}
// CHARS_PER_TOKEN is the number of bytes of text assumed to make up one token
// when estimating token counts.
const CHARS_PER_TOKEN = 4

// EstimateTokens returns a rough token count for text: its length in bytes
// divided by CHARS_PER_TOKEN, rounded up. Empty text yields 0.
func EstimateTokens(text string) int {
	byteLen := float64(len(text))
	perToken := float64(CHARS_PER_TOKEN)
	return int(math.Ceil(byteLen / perToken))
}
// SplitIntoEmbeddableChunks breaks text into line-aligned chunks for embedding.
//
// If the estimated token count of the whole text is below
// splitOptions.NoSplitTokensThreshold, the text is returned unsplit as a single
// chunk. Otherwise the text is split on "\n" and walked line by line while a
// running sum of per-line token estimates is kept. Before a line is added, the
// current chunk is closed when the running sum already exceeds
// splitOptions.ChunkTokensThreshold, or when it exceeds
// splitOptions.ChunkEarlySplitTokensThreshold and the line is splittable (empty,
// or starting with a comment or declaration prefix).
//
// StartLine is a zero-based inclusive line index and EndLine is exclusive.
// Chunks whose content is empty are dropped, and lines remaining after the
// last split are emitted only if they contributed at least one token.
func SplitIntoEmbeddableChunks(text string, fileName string, splitOptions SplitOptions) []EmbeddableChunk {
	// Short inputs are embedded as a single chunk covering every line.
	if EstimateTokens(text) < splitOptions.NoSplitTokensThreshold {
		whole := EmbeddableChunk{
			FileName:  fileName,
			StartLine: 0,
			EndLine:   strings.Count(text, "\n") + 1,
			Content:   text,
		}
		return []EmbeddableChunk{whole}
	}

	lines := strings.Split(text, "\n")
	result := []EmbeddableChunk{}
	chunkStart := 0
	pendingTokens := 0

	// flush closes the chunk spanning lines[chunkStart:end], records it unless
	// its content is empty, and starts a new chunk at end.
	flush := func(end int) {
		if body := strings.Join(lines[chunkStart:end], "\n"); body != "" {
			result = append(result, EmbeddableChunk{
				FileName:  fileName,
				StartLine: chunkStart,
				EndLine:   end,
				Content:   body,
			})
		}
		chunkStart = end
		pendingTokens = 0
	}

	for idx, line := range lines {
		// The split decision is made before the current line is counted, so
		// the line at idx becomes the first line of the next chunk.
		switch {
		case pendingTokens > splitOptions.ChunkTokensThreshold:
			flush(idx)
		case pendingTokens > splitOptions.ChunkEarlySplitTokensThreshold && isSplittableLine(line):
			flush(idx)
		}
		pendingTokens += EstimateTokens(line)
	}

	// Emit whatever is left, unless the remaining lines carried no tokens.
	if pendingTokens > 0 {
		flush(len(lines))
	}
	return result
}