Skip to content

Commit d49da26

Browse files
authored
feat(storage/dataflux): add range_splitter #10748 (#10899)
1 parent 7daa1bd commit d49da26

4 files changed

Lines changed: 677 additions & 8 deletions

File tree

storage/dataflux/fast_list.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,7 @@ type ListerInput struct {
4949
BatchSize int
5050

5151
// Query is the query to filter objects for listing. Default value is nil. Optional.
52-
//Use ProjectionNoACL For faster listing. ACL is expensive and this results in fewer objects
53-
// to be returned from GCS in each API call.
52+
// Use ProjectionNoACL for faster listing. Including ACLs increases latency while fetching objects.
5453
Query storage.Query
5554

5655
// SkipDirectoryObjects is to indicate whether to list directory objects. Default value is false. Optional.

storage/dataflux/integration_test.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,6 @@ var (
4949

5050
func TestMain(m *testing.M) {
5151
flag.Parse()
52-
fmt.Println("creating bucket")
5352
if err := httpTestBucket.Create(testPrefix); err != nil {
5453
log.Fatalf("test bucket creation failed: %v", err)
5554
}

storage/dataflux/range_splitter.go

Lines changed: 330 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,16 @@
1515
package dataflux
1616

1717
import (
18+
"fmt"
19+
"math/big"
20+
"sort"
1821
"sync"
1922
)
2023

2124
// rangeSplitter specifies the a list and a map of sorted alphabets.
2225
type rangeSplitter struct {
2326
mu sync.Mutex
24-
sortedAlphabet *[]rune
27+
sortedAlphabet []rune
2528
alphabetMap map[rune]int
2629
}
2730

@@ -31,12 +34,334 @@ type listRange struct {
3134
endRange string
3235
}
3336

37+
// minimalIntRange specifies start and end range in base-10 form, along with the
38+
// minimal string length for the split range strings.
39+
type minimalIntRange struct {
40+
startInteger *big.Int
41+
endInteger *big.Int
42+
minimalLength int
43+
}
44+
45+
// generateSplitsOpts specifies the parameters needed to generate the split
46+
// range strings.
47+
type generateSplitsOpts struct {
48+
minimalIntRange *minimalIntRange
49+
numSplits int
50+
startRange string
51+
endRange string
52+
}
53+
3454
// newRangeSplitter creates a new RangeSplitter with the given alphabets.
35-
func newRangeSplitter(alphabet string) *rangeSplitter {
36-
return &rangeSplitter{}
55+
// RangeSplitter determines split points within a given range based on the given
56+
// alphabets.
57+
func newRangeSplitter(alphabet string) (*rangeSplitter, error) {
58+
59+
// Validate that we do not have empty alphabet passed in.
60+
if len(alphabet) == 0 {
61+
return nil, fmt.Errorf("no alphabet specified for the range splitter")
62+
}
63+
// Sort the alphabet lexicographically and store a mapping of each alphabet
64+
// to its index. We need a mapping for efficient index lookup in later operations.
65+
sortedAlphabet := []rune(alphabet)
66+
sortAlphabet(sortedAlphabet)
67+
alphabetMap := constructAlphabetMap(sortedAlphabet)
68+
69+
return &rangeSplitter{
70+
alphabetMap: alphabetMap,
71+
sortedAlphabet: sortedAlphabet,
72+
}, nil
3773
}
3874

39-
// splitRange creates a given number of splits based on a provided start and end range.
75+
// splitRange divides the provided start and end range into approximately equal
76+
// subranges, returning the split points. An empty slice is returned if suitable
77+
// splits cannot be determined. Please note that this method provides a rough
78+
// estimate of split points, without ensuring precise even partitioning of the range.
79+
// Additionally, the number of found splits might be fewer than requested if the
80+
// algorithm struggles to find sufficient split points. If the start range is empty
81+
// the algorithm assumes it to be sequence of smallest possible character and empty
82+
// end range as sequence of highest possible characters.
83+
//
84+
// For example, sorted alphabet = {"a","b","c","d"}
85+
// Input: startRange= "d", endRange= "", numSplits=2
86+
//
87+
// This will be converted from base-N to base-10 integers.
88+
// While calculating base-10 integer, "a" will be appended to startRange
89+
// and "d" will be appended to endRange until the difference between integers is
90+
// more than number of splits.
91+
// startInteger for "da" = 12, endInteger for "dd" = 15
92+
//
93+
// The splits points will be 13 and 14 in base-10. This will be converted back to
94+
// base-N value and returned as split points:
95+
// {"db","dc"}
96+
4097
func (rs *rangeSplitter) splitRange(startRange, endRange string, numSplits int) ([]string, error) {
41-
return nil, nil
98+
// Number of splits has to be at least one, otherwise it is not splittable.
99+
if numSplits < 1 {
100+
return nil, fmt.Errorf("number of splits should be at least 1, got %d", numSplits)
101+
}
102+
103+
// End range (if specified) has to be lexicographically greater than the start range
104+
// for the range to be valid.
105+
if len(endRange) != 0 && startRange >= endRange {
106+
return nil, fmt.Errorf("start range %q cannot be lexicographically greater than end range %q", startRange, endRange)
107+
}
108+
109+
rs.addCharsToAlphabet([]rune(startRange))
110+
rs.addCharsToAlphabet([]rune(endRange))
111+
112+
// Validate start range characters and convert into character array form.
113+
startRangeCharArray, err := rs.convertRangeStringToArray(startRange)
114+
if err != nil {
115+
return nil, fmt.Errorf("unable to convert start range %q to array: %v", startRange, err)
116+
}
117+
118+
// Validate end range characters and convert into character array form.
119+
endRangeCharArray, err := rs.convertRangeStringToArray(endRange)
120+
if err != nil {
121+
return nil, fmt.Errorf("unable to convert end range %q to array: %v", endRange, err)
122+
}
123+
124+
// Construct the final split ranges to be returned.
125+
var splitPoints []string
126+
127+
// If the start and end string ranges are equal with padding, no splitting is
128+
// necessary. In such cases, an empty array of split ranges is returned.
129+
if rs.isRangeEqualWithPadding(startRangeCharArray, endRangeCharArray) {
130+
return splitPoints, nil
131+
}
132+
// Convert the range strings from base-N to base-10 and employ a greedy approach
133+
// to determine the smallest splittable integer range difference.
134+
minimalIntRange, err := rs.convertStringRangeToMinimalIntRange(
135+
startRangeCharArray, endRangeCharArray, numSplits)
136+
if err != nil {
137+
return nil, fmt.Errorf("range splitting with start range %q and end range %q: %v",
138+
startRange, endRange, err)
139+
}
140+
141+
// Generate the split points and return them.
142+
splitPoints = rs.generateSplits(generateSplitsOpts{
143+
startRange: startRange,
144+
endRange: endRange,
145+
numSplits: numSplits,
146+
minimalIntRange: minimalIntRange,
147+
})
148+
149+
return splitPoints, nil
150+
}
151+
152+
// generateSplits generates the split points by translating the start and end
153+
// range strings into base-10 integers, performing a split within the integer
154+
// domain, and then converting the splits back into strings. In essence, this
155+
// operation resembles a base-N to base-10 conversion, followed by a split in
156+
// base 10, and finally another base-10 to base-N conversion. In this scenario,
157+
// N represents the size of the alphabet, with the character's position in the
158+
// alphabet indicating the digit's value.
159+
func (rs *rangeSplitter) generateSplits(opts generateSplitsOpts) []string {
160+
161+
startInteger := opts.minimalIntRange.startInteger
162+
endInteger := opts.minimalIntRange.endInteger
163+
minimalLength := opts.minimalIntRange.minimalLength
164+
165+
rangeDifference := new(big.Int).Sub(endInteger, startInteger)
166+
167+
var splitPoints []string
168+
169+
// The number of intervals is one more than the number of split points.
170+
rangeInterval := new(big.Int).SetInt64(int64(opts.numSplits + 1))
171+
172+
for i := 1; i <= opts.numSplits; i++ {
173+
// Combine the range interval and index to determine the split point in base-10 form.
174+
rangeDiffWithIdx := new(big.Int).Mul(rangeDifference, big.NewInt(int64(i)))
175+
rangeInterval := new(big.Int).Div(rangeDiffWithIdx, rangeInterval)
176+
splitPoint := new(big.Int).Add(rangeInterval, startInteger)
177+
178+
// Convert the split point back from base-10 to base-N.
179+
splitString := rs.convertIntToString(splitPoint, minimalLength)
180+
181+
// Due to the approximate nature on how the minimal int range is derived, we need to perform
182+
// another validation to check to ensure each split point falls in valid range.
183+
isGreaterThanStart := len(splitString) > 0 && splitString > opts.startRange
184+
isLessThanEnd := len(opts.endRange) == 0 || (len(splitString) > 0 && splitString < opts.endRange)
185+
if isGreaterThanStart && isLessThanEnd {
186+
splitPoints = append(splitPoints, splitString)
187+
}
188+
}
189+
return splitPoints
190+
}
191+
192+
// sortAlphabet sorts the alphabets string lexicographically.
193+
func sortAlphabet(unsortedAlphabet []rune) {
194+
sort.Slice(unsortedAlphabet, func(i, j int) bool {
195+
return unsortedAlphabet[i] < unsortedAlphabet[j]
196+
})
197+
}
198+
199+
// constructAlphabetMap constructs a mapping from each character in the
200+
// alphabets to its index in the alphabet array.
201+
func constructAlphabetMap(alphabet []rune) map[rune]int {
202+
alphabetMap := make(map[rune]int)
203+
for i, char := range alphabet {
204+
alphabetMap[char] = i
205+
}
206+
return alphabetMap
207+
}
208+
209+
// addCharsToAlphabet adds a character to the known alphabet.
210+
func (rs *rangeSplitter) addCharsToAlphabet(characters []rune) {
211+
rs.mu.Lock() // Acquire the lock
212+
defer rs.mu.Unlock() // Release the lock when the function exits
213+
allAlphabet := rs.sortedAlphabet
214+
newChars := false
215+
for _, char := range characters {
216+
if _, exists := rs.alphabetMap[char]; !exists {
217+
allAlphabet = append(allAlphabet, char)
218+
newChars = true
219+
}
220+
}
221+
if newChars {
222+
sortAlphabet(allAlphabet)
223+
rs.sortedAlphabet = allAlphabet
224+
rs.alphabetMap = constructAlphabetMap(rs.sortedAlphabet)
225+
}
226+
}
227+
228+
// isRangeEqualWithPadding checks if two range strings are identical. Equality
229+
// encompasses any padding using the smallest alphabet character from the set.
230+
func (rs *rangeSplitter) isRangeEqualWithPadding(startRange, endRange []rune) bool {
231+
232+
sortedAlphabet := rs.sortedAlphabet
233+
234+
// When the end range is unspecified, it's interpreted as a sequence of the
235+
// highest possible characters. Consequently, they are not deemed equal.
236+
// If start range has highest possible characters, then smaller characters
237+
// are appended to start range to find split points.
238+
if len(endRange) == 0 {
239+
return false
240+
}
241+
242+
// Get the longer length of the two range strings.
243+
maxLength := max(len(startRange), len(endRange))
244+
245+
smallestChar := sortedAlphabet[0]
246+
247+
// Loop through the string range.
248+
for i := 0; i < maxLength; i++ {
249+
250+
// In cases where a character is absent at a specific position (due to a length
251+
// difference), the position is padded with the smallest character in the alphabet.
252+
charStart := charAtOrDefault(startRange, i, smallestChar)
253+
charEnd := charAtOrDefault(endRange, i, smallestChar)
254+
255+
// As soon as we find a difference, we conclude the two strings are different.
256+
if charStart != charEnd {
257+
return false
258+
}
259+
}
260+
// Otherwise, we conclude the two strings are equal.
261+
return true
262+
}
263+
264+
// charAtOrDefault returns the character at the specified position, or the default character if
265+
// the position is out of bounds.
266+
func charAtOrDefault(charArray []rune, position int, defaultChar rune) rune {
267+
if position < 0 || position >= len(charArray) {
268+
return defaultChar
269+
}
270+
return (charArray)[position]
271+
}
272+
273+
// convertStringRangeToMinimalIntRange gradually extends the start and end string
274+
// range in base-10 representation, until the difference reaches a threshold
275+
// suitable for splitting.
276+
func (rs *rangeSplitter) convertStringRangeToMinimalIntRange(
277+
startRange, endRange []rune, numSplits int) (*minimalIntRange, error) {
278+
279+
startInteger := big.NewInt(0)
280+
endInteger := big.NewInt(0)
281+
282+
alphabetLength := len(rs.sortedAlphabet)
283+
startChar := (rs.sortedAlphabet)[0]
284+
endChar := (rs.sortedAlphabet)[alphabetLength-1]
285+
286+
endDefaultChar := startChar
287+
if len(endRange) == 0 {
288+
endDefaultChar = endChar
289+
}
290+
291+
for i := 0; ; i++ {
292+
293+
// Convert each character of the start range string into a big integer
294+
// based on the alphabet system.
295+
startPosition, err := rs.charPosition(charAtOrDefault(startRange, i, startChar))
296+
if err != nil {
297+
return nil, err
298+
}
299+
startInteger.Mul(startInteger, big.NewInt(int64(alphabetLength)))
300+
startInteger.Add(startInteger, big.NewInt(int64(startPosition)))
301+
302+
// Convert each character of the end range string into a big integer
303+
// based on the alphabet system.
304+
endPosition, err := rs.charPosition(charAtOrDefault(endRange, i, endDefaultChar))
305+
if err != nil {
306+
return nil, err
307+
}
308+
endInteger.Mul(endInteger, big.NewInt(int64(alphabetLength)))
309+
endInteger.Add(endInteger, big.NewInt(int64(endPosition)))
310+
311+
// Calculate the difference between the start and end range in big integer representation.
312+
difference := new(big.Int).Sub(endInteger, startInteger)
313+
314+
// If the difference is bigger than the number of split points, we are done.
315+
// In particular, the minimal length is one greater than the index (due to zero indexing).
316+
if difference.Cmp(big.NewInt(int64(numSplits))) > 0 {
317+
return &minimalIntRange{
318+
startInteger: startInteger,
319+
endInteger: endInteger,
320+
minimalLength: i + 1,
321+
}, nil
322+
}
323+
}
324+
}
325+
326+
// charPosition returns the index of the character in the alphabet set.
327+
func (rs *rangeSplitter) charPosition(ch rune) (int, error) {
328+
if idx, ok := rs.alphabetMap[ch]; ok {
329+
return idx, nil
330+
}
331+
return -1, fmt.Errorf("character %c is not found in the alphabet map %v", ch, rs.alphabetMap)
332+
}
333+
334+
// convertRangeStringToArray transforms the range string into a rune slice while
335+
// verifying the presence of each character in the alphabets.
336+
func (rs *rangeSplitter) convertRangeStringToArray(rangeString string) ([]rune, error) {
337+
for _, char := range rangeString {
338+
if _, exists := rs.alphabetMap[char]; !exists {
339+
return nil, fmt.Errorf("character %c in range string %q is not found in the alphabet array", char, rangeString)
340+
}
341+
}
342+
characterArray := []rune(rangeString)
343+
return characterArray, nil
344+
}
345+
346+
// convertIntToString converts the split point from base-10 to base-N.
347+
func (rs *rangeSplitter) convertIntToString(splitPoint *big.Int, stringLength int) string {
348+
349+
remainder := new(big.Int)
350+
351+
var splitChar []rune
352+
alphabetSize := big.NewInt(int64(len(rs.sortedAlphabet)))
353+
354+
// Iterate through the split point and convert alphabet by alphabet.
355+
for i := 0; i < stringLength; i++ {
356+
remainder.Mod(splitPoint, alphabetSize)
357+
splitPoint.Div(splitPoint, alphabetSize)
358+
splitChar = append(splitChar, (rs.sortedAlphabet)[(int)(remainder.Int64())])
359+
}
360+
361+
// Reverse the converted alphabet order because we originally processed from right to left.
362+
for i, j := 0, len(splitChar)-1; i < j; i, j = i+1, j-1 {
363+
splitChar[i], splitChar[j] = splitChar[j], splitChar[i]
364+
}
365+
366+
return string(splitChar)
42367
}

0 commit comments

Comments
 (0)