# HG changeset patch # User David Scott # Date 1279751378 -3600 # Node ID d052a41ffabe74f791a1d62bd178a1cac13dc332 # Parent 542efa53c3e25342a175ead7f9327b972f976820 Add functions Zerocheck.get_next{zero,nonzero} which can be used to detect sparseness (relatively efficiently). Both functions primarily use aligned unsigned int-sized accesses. Signed-off-by: David Scott diff -r 542efa53c3e2 -r d052a41ffabe stdext/zerocheck.ml --- a/stdext/zerocheck.ml Wed Jul 21 23:29:15 2010 +0100 +++ b/stdext/zerocheck.ml Wed Jul 21 23:29:38 2010 +0100 @@ -12,3 +12,30 @@ * GNU Lesser General Public License for more details. *) external is_all_zeros : string -> int -> bool = "is_all_zeros" + +external _find_a_nonzero : string -> int -> int -> int = "find_a_nonzero" +external _find_a_zero : string -> int -> int -> int = "find_a_zero" + +let wrap f x len offset = + let remaining = len - offset in + if remaining <= 0 then raise (Invalid_argument "offset > length"); + let result = f x offset remaining in + if result = remaining then None else Some (result + offset) + +let find_a_nonzero = wrap _find_a_nonzero +let find_a_zero = wrap _find_a_zero + +let fold_over_nonzeros x len roundup f initial = + let rec inner acc offset = + if offset = len then acc + else + match find_a_nonzero x len offset with + | None -> acc (* no more *) + | Some s -> + let e = match find_a_zero x len s with + | None -> len + | Some e -> e in + let e = min len (roundup e) in + inner (f acc (s, e - s)) e in + inner initial 0 + diff -r 542efa53c3e2 -r d052a41ffabe stdext/zerocheck.mli --- a/stdext/zerocheck.mli Wed Jul 21 23:29:15 2010 +0100 +++ b/stdext/zerocheck.mli Wed Jul 21 23:29:38 2010 +0100 @@ -11,4 +11,24 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. *) + +(** [is_all_zeroes x len] returns true if the substring is all zeroes *) external is_all_zeros : string -> int -> bool = "is_all_zeros" + +(** [find_a_zero x len offset] returns the offset in [x] of a zero + character after [offset], or None if no zero was detected. + Note this function is approximate and is not guaranteed to find + strictly the first zero. *) +val find_a_zero: string -> int -> int -> int option + +(** [find_a_nonzero x len offset] returns the offset in [x] of a + nonzero character after [offset], or None if none could be detected. + Note this function is approximate and is not guaranteed to find + strictly the first nonzero. *) +val find_a_nonzero: string -> int -> int -> int option + +(** [fold_over_nonzeros buf len roundup f initial] folds [f] over all + (start, length) pairs of non-zero data in string [buf] up to [len]. + The end offset of each pair is rounded up with [roundup] (e.g. to + potential block boudaries. *) +val fold_over_nonzeros: string -> int -> (int -> int) -> ('a -> int * int -> 'a) -> 'a -> 'a diff -r 542efa53c3e2 -r d052a41ffabe stdext/zerocheck_stub.c --- a/stdext/zerocheck_stub.c Wed Jul 21 23:29:15 2010 +0100 +++ b/stdext/zerocheck_stub.c Wed Jul 21 23:29:38 2010 +0100 @@ -18,6 +18,88 @@ #include #include +#define OFFSET(s) (((unsigned int)s) & (sizeof(unsigned int) - 1)) + + +/* Return the offset of the next non-zero byte (possibly rounded down a bit). + The value 'remaining' is returned if there is no non-zero byte found. */ +value find_a_nonzero(value string, value offset, value remaining) +{ + CAMLparam3(string, offset, remaining); + int c_offset = Int_val(offset); + int c_remaining = Int_val(remaining); + int c_origremaining = c_remaining; + char *c_string = String_val(string); + char *s = c_string + c_offset; + + /* Go character by character until we hit an unsigned int boundary */ + while ((OFFSET(s) != 0) && (c_remaining > 0)){ + if (*s != '\000') goto finish; + s++; c_remaining--; + } + /* Go word by word. Note we don't need to determine the exact position + of the nonzero, it suffices to return the index of the word containing + the nonzero. */ + unsigned int *p = (unsigned int *)s; + while (c_remaining > 4){ + if (*p != 0) goto finish; + p++; c_remaining-=4; + } + /* Go character by character until the end of the string */ + s = (char*) p; + while (c_remaining > 0){ + if (*s != '\000') goto finish; + s++; c_remaining--; + } + /* c_remaining == 0 */ + finish: + /* If we didn't find a nonzero then we return c_origremaining. + If we did then we return the number of chars after the starting + offset where the word containing the nonzero was detected. */ + CAMLreturn(Val_int(c_origremaining - c_remaining)); + +} + +/* Return the offset of the next zero byte (possibly rounded up a bit). + The value 'remaining' is returned if there is no zero byte found. */ +value find_a_zero(value string, value offset, value remaining) +{ + CAMLparam3(string, offset, remaining); + int c_offset = Int_val(offset); + int c_remaining = Int_val(remaining); + int c_origremaining = c_remaining; + char *c_string = String_val(string); + char *s = c_string + c_offset; + + /* Go character by character until we hit an unsigned int boundary */ + while ((OFFSET(s) != 0) && (c_remaining > 0)){ + if (*s == '\000') goto finish; + s++; c_remaining--; + } + /* Go word by word. Note we don't need to determine the exact position + of the zero, it suffices to return the index of the word following + the zero. */ + unsigned int *p = (unsigned int *)s; + while (c_remaining > 4){ + if (*p == 0) goto finish; + p++; c_remaining-=4; + } + /* Go character by character until the end of the string */ + s = (char*) p; + while (c_remaining > 0){ + if (*s == '\000') goto finish; + s++; c_remaining--; + } + /* c_remaining == 0 */ + finish: + /* If we didn't find a zero then we return c_origremaining. + If we did then we return the number of chars after the starting + offset where the word containing the zero was detected. */ + CAMLreturn(Val_int(c_origremaining - c_remaining)); +} + + + /* for better performance in all case, we should process the unalign data at * the beginning until we reach a 32 bit align value, however since ocaml * allocate the string and we don't use any offset in this string, the string