1515package dataflux
1616
1717import (
18+ "fmt"
19+ "math/big"
20+ "sort"
1821 "sync"
1922)
2023
2124// rangeSplitter specifies the a list and a map of sorted alphabets.
2225type rangeSplitter struct {
2326 mu sync.Mutex
24- sortedAlphabet * []rune
27+ sortedAlphabet []rune
2528 alphabetMap map [rune ]int
2629}
2730
@@ -31,12 +34,334 @@ type listRange struct {
3134 endRange string
3235}
3336
37+ // minimalIntRange specifies start and end range in base-10 form, along with the
38+ // minimal string length for the split range strings.
39+ type minimalIntRange struct {
40+ startInteger * big.Int
41+ endInteger * big.Int
42+ minimalLength int
43+ }
44+
45+ // generateSplitsOpts specifies the parameters needed to generate the split
46+ // range strings.
47+ type generateSplitsOpts struct {
48+ minimalIntRange * minimalIntRange
49+ numSplits int
50+ startRange string
51+ endRange string
52+ }
53+
3454// newRangeSplitter creates a new RangeSplitter with the given alphabets.
35- func newRangeSplitter (alphabet string ) * rangeSplitter {
36- return & rangeSplitter {}
55+ // RangeSplitter determines split points within a given range based on the given
56+ // alphabets.
57+ func newRangeSplitter (alphabet string ) (* rangeSplitter , error ) {
58+
59+ // Validate that we do not have empty alphabet passed in.
60+ if len (alphabet ) == 0 {
61+ return nil , fmt .Errorf ("no alphabet specified for the range splitter" )
62+ }
63+ // Sort the alphabet lexicographically and store a mapping of each alphabet
64+ // to its index. We need a mapping for efficient index lookup in later operations.
65+ sortedAlphabet := []rune (alphabet )
66+ sortAlphabet (sortedAlphabet )
67+ alphabetMap := constructAlphabetMap (sortedAlphabet )
68+
69+ return & rangeSplitter {
70+ alphabetMap : alphabetMap ,
71+ sortedAlphabet : sortedAlphabet ,
72+ }, nil
3773}
3874
39- // splitRange creates a given number of splits based on a provided start and end range.
75+ // splitRange divides the provided start and end range into approximately equal
76+ // subranges, returning the split points. An empty slice is returned if suitable
77+ // splits cannot be determined. Please note that this method provides a rough
78+ // estimate of split points, without ensuring precise even partitioning of the range.
79+ // Additionally, the number of found splits might be fewer than requested if the
80+ // algorithm struggles to find sufficient split points. If the start range is empty
81+ // the algorithm assumes it to be sequence of smallest possible character and empty
82+ // end range as sequence of highest possible characters.
83+ //
84+ // For example, sorted alphabet = {"a","b","c","d"}
85+ // Input: startRange= "d", endRange= "", numSplits=2
86+ //
87+ // This will be converted from base-N to base-10 integers.
88+ // While calculating base-10 integer, "a" will be appended to startRange
89+ // and "d" will be appended to endRange until the difference between integers is
90+ // more than number of splits.
91+ // startInteger for "da" = 12, endInteger for "dd" = 15
92+ //
93+ // The splits points will be 13 and 14 in base-10. This will be converted back to
94+ // base-N value and returned as split points:
95+ // {"db","dc"}
96+
4097func (rs * rangeSplitter ) splitRange (startRange , endRange string , numSplits int ) ([]string , error ) {
41- return nil , nil
98+ // Number of splits has to be at least one, otherwise it is not splittable.
99+ if numSplits < 1 {
100+ return nil , fmt .Errorf ("number of splits should be at least 1, got %d" , numSplits )
101+ }
102+
103+ // End range (if specified) has to be lexicographically greater than the start range
104+ // for the range to be valid.
105+ if len (endRange ) != 0 && startRange >= endRange {
106+ return nil , fmt .Errorf ("start range %q cannot be lexicographically greater than end range %q" , startRange , endRange )
107+ }
108+
109+ rs .addCharsToAlphabet ([]rune (startRange ))
110+ rs .addCharsToAlphabet ([]rune (endRange ))
111+
112+ // Validate start range characters and convert into character array form.
113+ startRangeCharArray , err := rs .convertRangeStringToArray (startRange )
114+ if err != nil {
115+ return nil , fmt .Errorf ("unable to convert start range %q to array: %v" , startRange , err )
116+ }
117+
118+ // Validate end range characters and convert into character array form.
119+ endRangeCharArray , err := rs .convertRangeStringToArray (endRange )
120+ if err != nil {
121+ return nil , fmt .Errorf ("unable to convert end range %q to array: %v" , endRange , err )
122+ }
123+
124+ // Construct the final split ranges to be returned.
125+ var splitPoints []string
126+
127+ // If the start and end string ranges are equal with padding, no splitting is
128+ // necessary. In such cases, an empty array of split ranges is returned.
129+ if rs .isRangeEqualWithPadding (startRangeCharArray , endRangeCharArray ) {
130+ return splitPoints , nil
131+ }
132+ // Convert the range strings from base-N to base-10 and employ a greedy approach
133+ // to determine the smallest splittable integer range difference.
134+ minimalIntRange , err := rs .convertStringRangeToMinimalIntRange (
135+ startRangeCharArray , endRangeCharArray , numSplits )
136+ if err != nil {
137+ return nil , fmt .Errorf ("range splitting with start range %q and end range %q: %v" ,
138+ startRange , endRange , err )
139+ }
140+
141+ // Generate the split points and return them.
142+ splitPoints = rs .generateSplits (generateSplitsOpts {
143+ startRange : startRange ,
144+ endRange : endRange ,
145+ numSplits : numSplits ,
146+ minimalIntRange : minimalIntRange ,
147+ })
148+
149+ return splitPoints , nil
150+ }
151+
152+ // generateSplits generates the split points by translating the start and end
153+ // range strings into base-10 integers, performing a split within the integer
154+ // domain, and then converting the splits back into strings. In essence, this
155+ // operation resembles a base-N to base-10 conversion, followed by a split in
156+ // base 10, and finally another base-10 to base-N conversion. In this scenario,
157+ // N represents the size of the alphabet, with the character's position in the
158+ // alphabet indicating the digit's value.
159+ func (rs * rangeSplitter ) generateSplits (opts generateSplitsOpts ) []string {
160+
161+ startInteger := opts .minimalIntRange .startInteger
162+ endInteger := opts .minimalIntRange .endInteger
163+ minimalLength := opts .minimalIntRange .minimalLength
164+
165+ rangeDifference := new (big.Int ).Sub (endInteger , startInteger )
166+
167+ var splitPoints []string
168+
169+ // The number of intervals is one more than the number of split points.
170+ rangeInterval := new (big.Int ).SetInt64 (int64 (opts .numSplits + 1 ))
171+
172+ for i := 1 ; i <= opts .numSplits ; i ++ {
173+ // Combine the range interval and index to determine the split point in base-10 form.
174+ rangeDiffWithIdx := new (big.Int ).Mul (rangeDifference , big .NewInt (int64 (i )))
175+ rangeInterval := new (big.Int ).Div (rangeDiffWithIdx , rangeInterval )
176+ splitPoint := new (big.Int ).Add (rangeInterval , startInteger )
177+
178+ // Convert the split point back from base-10 to base-N.
179+ splitString := rs .convertIntToString (splitPoint , minimalLength )
180+
181+ // Due to the approximate nature on how the minimal int range is derived, we need to perform
182+ // another validation to check to ensure each split point falls in valid range.
183+ isGreaterThanStart := len (splitString ) > 0 && splitString > opts .startRange
184+ isLessThanEnd := len (opts .endRange ) == 0 || (len (splitString ) > 0 && splitString < opts .endRange )
185+ if isGreaterThanStart && isLessThanEnd {
186+ splitPoints = append (splitPoints , splitString )
187+ }
188+ }
189+ return splitPoints
190+ }
191+
192+ // sortAlphabet sorts the alphabets string lexicographically.
193+ func sortAlphabet (unsortedAlphabet []rune ) {
194+ sort .Slice (unsortedAlphabet , func (i , j int ) bool {
195+ return unsortedAlphabet [i ] < unsortedAlphabet [j ]
196+ })
197+ }
198+
199+ // constructAlphabetMap constructs a mapping from each character in the
200+ // alphabets to its index in the alphabet array.
201+ func constructAlphabetMap (alphabet []rune ) map [rune ]int {
202+ alphabetMap := make (map [rune ]int )
203+ for i , char := range alphabet {
204+ alphabetMap [char ] = i
205+ }
206+ return alphabetMap
207+ }
208+
209+ // addCharsToAlphabet adds a character to the known alphabet.
210+ func (rs * rangeSplitter ) addCharsToAlphabet (characters []rune ) {
211+ rs .mu .Lock () // Acquire the lock
212+ defer rs .mu .Unlock () // Release the lock when the function exits
213+ allAlphabet := rs .sortedAlphabet
214+ newChars := false
215+ for _ , char := range characters {
216+ if _ , exists := rs .alphabetMap [char ]; ! exists {
217+ allAlphabet = append (allAlphabet , char )
218+ newChars = true
219+ }
220+ }
221+ if newChars {
222+ sortAlphabet (allAlphabet )
223+ rs .sortedAlphabet = allAlphabet
224+ rs .alphabetMap = constructAlphabetMap (rs .sortedAlphabet )
225+ }
226+ }
227+
228+ // isRangeEqualWithPadding checks if two range strings are identical. Equality
229+ // encompasses any padding using the smallest alphabet character from the set.
230+ func (rs * rangeSplitter ) isRangeEqualWithPadding (startRange , endRange []rune ) bool {
231+
232+ sortedAlphabet := rs .sortedAlphabet
233+
234+ // When the end range is unspecified, it's interpreted as a sequence of the
235+ // highest possible characters. Consequently, they are not deemed equal.
236+ // If start range has highest possible characters, then smaller characters
237+ // are appended to start range to find split points.
238+ if len (endRange ) == 0 {
239+ return false
240+ }
241+
242+ // Get the longer length of the two range strings.
243+ maxLength := max (len (startRange ), len (endRange ))
244+
245+ smallestChar := sortedAlphabet [0 ]
246+
247+ // Loop through the string range.
248+ for i := 0 ; i < maxLength ; i ++ {
249+
250+ // In cases where a character is absent at a specific position (due to a length
251+ // difference), the position is padded with the smallest character in the alphabet.
252+ charStart := charAtOrDefault (startRange , i , smallestChar )
253+ charEnd := charAtOrDefault (endRange , i , smallestChar )
254+
255+ // As soon as we find a difference, we conclude the two strings are different.
256+ if charStart != charEnd {
257+ return false
258+ }
259+ }
260+ // Otherwise, we conclude the two strings are equal.
261+ return true
262+ }
263+
264+ // charAtOrDefault returns the character at the specified position, or the default character if
265+ // the position is out of bounds.
266+ func charAtOrDefault (charArray []rune , position int , defaultChar rune ) rune {
267+ if position < 0 || position >= len (charArray ) {
268+ return defaultChar
269+ }
270+ return (charArray )[position ]
271+ }
272+
273+ // convertStringRangeToMinimalIntRange gradually extends the start and end string
274+ // range in base-10 representation, until the difference reaches a threshold
275+ // suitable for splitting.
276+ func (rs * rangeSplitter ) convertStringRangeToMinimalIntRange (
277+ startRange , endRange []rune , numSplits int ) (* minimalIntRange , error ) {
278+
279+ startInteger := big .NewInt (0 )
280+ endInteger := big .NewInt (0 )
281+
282+ alphabetLength := len (rs .sortedAlphabet )
283+ startChar := (rs .sortedAlphabet )[0 ]
284+ endChar := (rs .sortedAlphabet )[alphabetLength - 1 ]
285+
286+ endDefaultChar := startChar
287+ if len (endRange ) == 0 {
288+ endDefaultChar = endChar
289+ }
290+
291+ for i := 0 ; ; i ++ {
292+
293+ // Convert each character of the start range string into a big integer
294+ // based on the alphabet system.
295+ startPosition , err := rs .charPosition (charAtOrDefault (startRange , i , startChar ))
296+ if err != nil {
297+ return nil , err
298+ }
299+ startInteger .Mul (startInteger , big .NewInt (int64 (alphabetLength )))
300+ startInteger .Add (startInteger , big .NewInt (int64 (startPosition )))
301+
302+ // Convert each character of the end range string into a big integer
303+ // based on the alphabet system.
304+ endPosition , err := rs .charPosition (charAtOrDefault (endRange , i , endDefaultChar ))
305+ if err != nil {
306+ return nil , err
307+ }
308+ endInteger .Mul (endInteger , big .NewInt (int64 (alphabetLength )))
309+ endInteger .Add (endInteger , big .NewInt (int64 (endPosition )))
310+
311+ // Calculate the difference between the start and end range in big integer representation.
312+ difference := new (big.Int ).Sub (endInteger , startInteger )
313+
314+ // If the difference is bigger than the number of split points, we are done.
315+ // In particular, the minimal length is one greater than the index (due to zero indexing).
316+ if difference .Cmp (big .NewInt (int64 (numSplits ))) > 0 {
317+ return & minimalIntRange {
318+ startInteger : startInteger ,
319+ endInteger : endInteger ,
320+ minimalLength : i + 1 ,
321+ }, nil
322+ }
323+ }
324+ }
325+
326+ // charPosition returns the index of the character in the alphabet set.
327+ func (rs * rangeSplitter ) charPosition (ch rune ) (int , error ) {
328+ if idx , ok := rs .alphabetMap [ch ]; ok {
329+ return idx , nil
330+ }
331+ return - 1 , fmt .Errorf ("character %c is not found in the alphabet map %v" , ch , rs .alphabetMap )
332+ }
333+
334+ // convertRangeStringToArray transforms the range string into a rune slice while
335+ // verifying the presence of each character in the alphabets.
336+ func (rs * rangeSplitter ) convertRangeStringToArray (rangeString string ) ([]rune , error ) {
337+ for _ , char := range rangeString {
338+ if _ , exists := rs .alphabetMap [char ]; ! exists {
339+ return nil , fmt .Errorf ("character %c in range string %q is not found in the alphabet array" , char , rangeString )
340+ }
341+ }
342+ characterArray := []rune (rangeString )
343+ return characterArray , nil
344+ }
345+
346+ // convertIntToString converts the split point from base-10 to base-N.
347+ func (rs * rangeSplitter ) convertIntToString (splitPoint * big.Int , stringLength int ) string {
348+
349+ remainder := new (big.Int )
350+
351+ var splitChar []rune
352+ alphabetSize := big .NewInt (int64 (len (rs .sortedAlphabet )))
353+
354+ // Iterate through the split point and convert alphabet by alphabet.
355+ for i := 0 ; i < stringLength ; i ++ {
356+ remainder .Mod (splitPoint , alphabetSize )
357+ splitPoint .Div (splitPoint , alphabetSize )
358+ splitChar = append (splitChar , (rs .sortedAlphabet )[(int )(remainder .Int64 ())])
359+ }
360+
361+ // Reverse the converted alphabet order because we originally processed from right to left.
362+ for i , j := 0 , len (splitChar )- 1 ; i < j ; i , j = i + 1 , j - 1 {
363+ splitChar [i ], splitChar [j ] = splitChar [j ], splitChar [i ]
364+ }
365+
366+ return string (splitChar )
42367}
0 commit comments