/*
This file was generated by ./make-single.pl
from /usr/home/ben/projects/text-fuzzy/text-fuzzy.c,
ed-trans-char.c, ed-trans-int.c, edit-distance-char.c, edit-distance-int.c, and
/usr/home/ben/projects/text-fuzzy/text-fuzzy.h.
*/
#line 1 "text-fuzzy.c"
#include <stdio.h>
#include <stdarg.h>
#include <stdlib.h>
#ifdef HEADER
#ifndef ERROR_HANDLER
#define ERROR_HANDLER text_fuzzy_error_handler
#endif /* undef ERROR_HANDLER */
#ifndef ERROR_HANDLER_H
#define ERROR_HANDLER_H
typedef int (* error_handler_t) (const char * source_file,
int source_line_number,
const char * message, ...)
__attribute__ ((format (printf, 3, 4)));
#endif /* ndef ERROR_HANDLER_H */
extern error_handler_t text_fuzzy_error_handler;
#endif /* def HEADER */
#ifndef ERROR_HANDLER_H
#define ERROR_HANDLER_H
typedef int (* error_handler_t) (const char * source_file,
int source_line_number,
const char * message, ...)
__attribute__ ((format (printf, 3, 4)));
#endif /* ndef ERROR_HANDLER_H */
/* This is the default error handler for this namespace. */
static int
text_fuzzy_default_error_handler (const char * source_file,
int source_line_number,
const char * message, ...)
{
va_list args;
fprintf (stderr, "%s:%d: ", source_file, source_line_number);
va_start (args, message);
vfprintf (stderr, message, args);
fprintf (stderr, "\n");
return 0;
}
/* This global variable is the error handler for this namespace. */
error_handler_t text_fuzzy_error_handler =
text_fuzzy_default_error_handler;
/* Print an error message for a failed condition "condition" at the
appropriate line. */
#define LINE_ERROR(condition, status) \
if (text_fuzzy_error_handler) { \
(* text_fuzzy_error_handler) \
(__FILE__, __LINE__, \
"Failed test '%s', returning status '%s': %s", \
#condition, #status, \
text_fuzzy_statuses \
[text_fuzzy_status_ ## status]); \
}
/* Fail a test, without message. */
#define FAIL(condition, status) \
if (condition) { \
LINE_ERROR (condition, status); \
return text_fuzzy_status_ ## status; \
}
/* Fail a test, with message. */
#define FAIL_MSG(condition, status, msg, args...) \
if (condition) { \
LINE_ERROR (condition, status); \
if (text_fuzzy_error_handler) { \
(* text_fuzzy_error_handler) \
(__FILE__, __LINE__, \
msg, ## args); \
} \
return text_fuzzy_status_ ## status; \
}
#define OK return text_fuzzy_status_ok;
/* Call a function and print an error message and return if the
function returns an error value. */
#define CALL(x) { \
text_fuzzy_status_t _status = text_fuzzy_ ## x; \
if (_status != text_fuzzy_status_ok) { \
if (text_fuzzy_error_handler) { \
(* text_fuzzy_error_handler) \
(__FILE__, __LINE__, \
"Call 'text_fuzzy_%s' " \
"failed with status '%d': %s", \
#x, _status, \
text_fuzzy_statuses[_status]); \
} \
return _status; \
} \
}
/*
Local variables:
mode: c
End:
*/
const char * text_fuzzy_statuses[] = {
"normal operation",
"out of memory",
"a pointer contained a zero value",
"open error",
"close error",
"read error",
"line too long",
"There was an attempt to make a Unicode alphabet on a non-Unicode string.",
"max min miscalculation",
"A string for comparison was larger than the value of HUGE defined in the code.",
"An attempt was made to use the maximum edit distance which was unset.",
"miscount",
};
#ifdef VERBOSE
#define MESSAGE(format, args...) { \
printf ("%s:%d: ", __FILE__, __LINE__); \
printf (format, ## args); \
}
#else /* VERBOSE */
#define MESSAGE(format, args...)
#endif /* VERBOSE */
/* Local variables:
mode: c
End:
*/
#ifdef HEADER
extern const char * text_fuzzy_statuses[];
#ifndef ERROR_HANDLER_H
#define ERROR_HANDLER_H
typedef int (* error_handler_t) (const char * source_file,
int source_line_number,
const char * message, ...)
__attribute__ ((format (printf, 3, 4)));
#endif /* ndef ERROR_HANDLER_H */
extern error_handler_t text_fuzzy_error_handler;
#ifndef ERROR_HANDLER
#include <stdio.h>
#include <stdarg.h>
static void default_error_handler (const char * file, int line,
const char * format, ...)
{
va_list a;
va_start (a, format);
fprintf (stderr, "%s:%d ", file, line);
vfprintf (stderr, format, a);
fprintf (stderr, "\n");
va_end (a);
}
#define ERROR_HANDLER default_error_handler
#endif /* ERROR_HANDLER */
#define TEXT_FUZZY(x) { \
text_fuzzy_status_t status; \
status = text_fuzzy_ ## x; \
if (status != text_fuzzy_status_ok) { \
/* Print error and return. */ \
ERROR_HANDLER (__FILE__, __LINE__, \
"Call to %s failed: %s", \
#x, text_fuzzy_statuses[status]); \
return TEXT_FUZZY_USER_ERROR; \
} \
}
/*
Local variables:
mode: c
End:
*/
typedef enum {
text_fuzzy_status_ok,
text_fuzzy_status_memory_failure,
text_fuzzy_status_null_pointer,
text_fuzzy_status_open_error,
text_fuzzy_status_close_error,
text_fuzzy_status_read_error,
text_fuzzy_status_line_too_long,
text_fuzzy_status_ualphabet_on_non_unicode,
text_fuzzy_status_max_min_miscalculation,
text_fuzzy_status_string_too_long,
text_fuzzy_status_max_distance_misuse,
text_fuzzy_status_miscount,
}
text_fuzzy_status_t;
#endif /* def HEADER */
#line 1 "/usr/home/ben/projects/text-fuzzy/text-fuzzy.c.in"
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <limits.h>
#include <stdint.h>
#line 1 "/usr/home/ben/projects/text-fuzzy/config.h"
#ifndef TEXT_FUZZY_CONFIG
#define TEXT_FUZZY_CONFIG
#define NO_MAX_DISTANCE -1
#define STRING_MAX_CHARS (0x1000 * 0x1000)
#define UALPHABET_MAX_SIZE 0x10000
#endif /* ndef TEXT_FUZZY_CONFIG */
#line 1 "/usr/home/ben/projects/text-fuzzy/text-fuzzy.h"
/*
This file was generated by the following command:
cfunctions text-fuzzy.c
*/
#ifndef CFH_TEXT_FUZZY_H
#define CFH_TEXT_FUZZY_H
#line 4 "text-fuzzy.c"
#ifndef ERROR_HANDLER
#define ERROR_HANDLER text_fuzzy_error_handler
#endif /* undef ERROR_HANDLER */
#ifndef ERROR_HANDLER_H
#define ERROR_HANDLER_H
typedef int (* error_handler_t) (const char * source_file,
int source_line_number,
const char * message, ...)
__attribute__ ((format (printf, 3, 4)));
#endif /* ndef ERROR_HANDLER_H */
#line 16 "text-fuzzy.c"
extern error_handler_t text_fuzzy_error_handler;
#line 45 "text-fuzzy.c"
extern error_handler_t text_fuzzy_error_handler;
#line 115 "text-fuzzy.c"
extern const char *text_fuzzy_statuses[];
#line 128 "text-fuzzy.c"
#line 129 "text-fuzzy.c"
extern const char *text_fuzzy_statuses[];
#ifndef ERROR_HANDLER_H
#define ERROR_HANDLER_H
typedef int (* error_handler_t) (const char * source_file,
int source_line_number,
const char * message, ...)
__attribute__ ((format (printf, 3, 4)));
#endif /* ndef ERROR_HANDLER_H */
#line 137 "text-fuzzy.c"
extern error_handler_t text_fuzzy_error_handler;
#ifndef ERROR_HANDLER
#include <stdio.h>
#include <stdarg.h>
#line 145 "text-fuzzy.c"
static void default_error_handler (const char* file, int line, const char* format, ...)
{
va_list a;
va_start (a, format);
fprintf (stderr, "%s:%d ", file, line);
vfprintf (stderr, format, a);
fprintf (stderr, "\n");
va_end (a);
}
#define ERROR_HANDLER default_error_handler
#endif /* ERROR_HANDLER */
#define TEXT_FUZZY(x) { \
text_fuzzy_status_t status; \
status = text_fuzzy_ ## x; \
if (status != text_fuzzy_status_ok) { \
\
ERROR_HANDLER (__FILE__, __LINE__, \
"Call to %s failed: %s", \
#x, text_fuzzy_statuses[status]); \
return TEXT_FUZZY_USER_ERROR; \
} \
}
typedef enum {
text_fuzzy_status_ok,
text_fuzzy_status_memory_failure,
text_fuzzy_status_null_pointer,
text_fuzzy_status_open_error,
text_fuzzy_status_close_error,
text_fuzzy_status_read_error,
text_fuzzy_status_line_too_long,
text_fuzzy_status_ualphabet_on_non_unicode,
text_fuzzy_status_max_min_miscalculation,
text_fuzzy_status_string_too_long,
text_fuzzy_status_max_distance_misuse,
text_fuzzy_status_miscount,
}
text_fuzzy_status_t;
#line 10 "/usr/home/ben/projects/text-fuzzy/text-fuzzy.c.in"
#ifdef __GNUC__
#define STACKALLOCOK 1
#else
#undef STACKALLOCOK
#endif
#ifdef PERL_MEMORY_MANAGEMENT
#define CALLOC(x,n,t) Newxz(x,n,t)
#define REALLOC(x,n,t) Renew(x,n,t)
#define FREE(x) Safefree(x)
#else /* not PERL_MEMORY_MANAGEMENT */
#define CALLOC(x,n,t) { \
x = calloc (n, sizeof (t)); \
if (! x) { \
fprintf (stderr, "%s:%d: calloc %d x %d failed.\n", \
__FILE__, __LINE__, n, (int) sizeof (t)); \
return text_fuzzy_status_memory_failure; \
} \
}
#define REALLOC(x,n,t) x = realloc (x, n * sizeof (t))
#define FREE(x) free (x)
#endif /* def PERL_MEMORY_MANAGEMENT */
#line 40 "/usr/home/ben/projects/text-fuzzy/text-fuzzy.c.in"
typedef struct ualphabet
{
/* The smallest character in our alphabet. */
int min;
/* The largest character in our alphabet. */
int max;
/* Number of chars allocated in the following array. */
int size;
/* Array containing Unicode alphabet, as a bitmap. */
unsigned char * alphabet;
/* The number of characters which were rejected using the Unicode
alphabet. */
int rejections;
}
ualphabet_t;
typedef struct text_fuzzy_string
{
/* The text of the string. */
const char * text;
/* The length of "text". */
int length;
/* The characters of "text" expanded out into unicode
characters. */
int * unicode;
/* The length of "unicode". */
int ulength;
/* Is "text" allocated? */
unsigned int allocated : 1;
}
text_fuzzy_string_t;
typedef struct candidate candidate_t;
struct candidate{
int distance;
int offset;
candidate_t * next;
}
#line 60 "/usr/home/ben/projects/text-fuzzy/text-fuzzy.c.in"
;
typedef struct
{
int dic[UINT8_MAX];
}
adic_t;
typedef struct item
{
/* The character. */
int key;
/* The position of the character in the first string. */
unsigned int position;
}
idic_item_t;
typedef struct
{
/* Allocated by size of dictionary. */
idic_item_t * items;
/* Largest value. */
int max;
}
idic_t;
typedef enum
{
/* Use zero value as invalid, so we know if we haven't done things
* properly. */
tf_invalid,
/* Keep, insert, replace, delete for standard Lev. edits. */
tf_keep,
tf_insert,
tf_replace,
tf_delete,
/* Transpose THIS char with the NEXT char. */
tf_transpose,
}
tf_edit_t;
typedef struct text_fuzzy
{
/* The string we are to match. */
text_fuzzy_string_t text;
/* The matching string. */
text_fuzzy_string_t b;
/* The maximum edit distance we allow for. */
int max_distance;
/* The maximum edit distance the user will allow. We are going to
cheat and ignore the user's value. */
int max_distance_holder;
/* The number of mallocs we are guilty of. */
int n_mallocs;
/* ASCII alphabet */
int alphabet[0x100];
/* The number of characters which were rejected using the ASCII
alphabet. */
int alphabet_rejections;
/* Unicode alphabet. */
ualphabet_t ualphabet;
/* The minimum distance we got in our most recent effort. */
int distance;
/* The number of units allocated for "b.unicode". This is not the
string length. This is used when deciding whether there is
sufficient space to store a test string. */
int b_unicode_length;
/* The number of items which have been rejected because the length
difference is bigger than the maximum edit distance. */
int length_rejections;
/* A character which is not in use. */
unsigned char invalid_char;
/* Candidates for an array match. */
candidate_t first;
candidate_t * last;
/* When scanning an array, put the index of the element of the
array into "text_fuzzy->offset". The offset of the nearest
elements are preserved in the "candidate_t" linked list which
starts off with "text_fuzzy".
There is currently no sanity check, so if the user forgets to
set "offset" each time around the loop, the code will not
notice anything amiss and just send a list of zeros back to the
user. */
int offset;
/* ASCII match dictionary for transposition searches. */
adic_t adic;
/* Unicode match dictionary (integer) for transposition
searches. */
idic_t idic;
/* The edits required to turn "text" into "b". */
tf_edit_t * edits;
int edits_size;
int n_edit;
/* Does the user want to use an alphabet filter? Default is yes,
so this must be set to a non-zero value to switch off use. */
unsigned int user_no_alphabet : 1;
/* Are we actually going to use it? (This may be false even if the
user wants to use it, for silly cases, but is not true if the
user does not want to use it.) */
unsigned int use_alphabet : 1;
unsigned int use_ualphabet : 1;
/* Variable edit costs? (currently unused) */
unsigned int variable_edit_costs : 1;
/* Do we account for transpositions? */
unsigned int transpositions_ok : 1;
/* Did we find it? */
unsigned int found : 1;
/* Is this Unicode? */
unsigned int unicode : 1;
/* Do we want to skip exact matches? */
unsigned int no_exact : 1;
/* Are we scanning a list of entries? */
unsigned int scanning : 1;
/* Do we want an array of answers? */
unsigned int wantarray : 1;
/* Has idic been built for this string? */
unsigned int idic_ok : 1;
}
text_fuzzy_t;
#define TEXT_FUZZY_INVALID_UNICODE_LENGTH -1
#line 90 "/usr/home/ben/projects/text-fuzzy/text-fuzzy.c.in"
int minimum (int a, int b);
#line 191 "/usr/home/ben/projects/text-fuzzy/text-fuzzy.c.in"
int idic_free_dic (idic_t* dic);
#line 211 "/usr/home/ben/projects/text-fuzzy/text-fuzzy.c.in"
int idic_find (idic_t* dic, int key);
#line 224 "/usr/home/ben/projects/text-fuzzy/text-fuzzy.c.in"
int idic_set (idic_t* dic, int key, int position);
#line 241 "/usr/home/ben/projects/text-fuzzy/text-fuzzy.c.in"
int idic_reset (idic_t* dic);
#line 251 "/usr/home/ben/projects/text-fuzzy/text-fuzzy.c.in"
int adic_reset_dic (adic_t* dic);
#line 261 "/usr/home/ben/projects/text-fuzzy/text-fuzzy.c.in"
int adic_find (adic_t* dic, uint8_t key);
#line 267 "/usr/home/ben/projects/text-fuzzy/text-fuzzy.c.in"
int adic_set (adic_t* dic, uint8_t key, int position);
#line 324 "/usr/home/ben/projects/text-fuzzy/text-fuzzy.c.in"
text_fuzzy_status_t text_fuzzy_generate_ualphabet (text_fuzzy_t* tf);
#line 495 "/usr/home/ben/projects/text-fuzzy/text-fuzzy.c.in"
text_fuzzy_status_t text_fuzzy_compare_single (text_fuzzy_t* tf);
#line 651 "/usr/home/ben/projects/text-fuzzy/text-fuzzy.c.in"
text_fuzzy_status_t text_fuzzy_get_candidates (text_fuzzy_t* text_fuzzy, int* n_candidates_ptr, int** candidates_ptr);
#line 701 "/usr/home/ben/projects/text-fuzzy/text-fuzzy.c.in"
text_fuzzy_status_t text_fuzzy_free_candidates (text_fuzzy_t* text_fuzzy, int* candidates);
#line 716 "/usr/home/ben/projects/text-fuzzy/text-fuzzy.c.in"
text_fuzzy_status_t text_fuzzy_generate_alphabet (text_fuzzy_t* text_fuzzy);
#line 748 "/usr/home/ben/projects/text-fuzzy/text-fuzzy.c.in"
text_fuzzy_status_t text_fuzzy_begin_scanning (text_fuzzy_t* text_fuzzy);
#line 778 "/usr/home/ben/projects/text-fuzzy/text-fuzzy.c.in"
text_fuzzy_status_t text_fuzzy_end_scanning (text_fuzzy_t* text_fuzzy);
#line 862 "/usr/home/ben/projects/text-fuzzy/text-fuzzy.c.in"
text_fuzzy_status_t text_fuzzy_scan_file (text_fuzzy_t* text_fuzzy, char* file_name, char** nearest_ptr, int* nearest_length_ptr);
#line 907 "/usr/home/ben/projects/text-fuzzy/text-fuzzy.c.in"
text_fuzzy_status_t text_fuzzy_scan_file_free (char* nearest);
#line 913 "/usr/home/ben/projects/text-fuzzy/text-fuzzy.c.in"
text_fuzzy_status_t text_fuzzy_alphabet_rejections (text_fuzzy_t* text_fuzzy, int* r);
#line 921 "/usr/home/ben/projects/text-fuzzy/text-fuzzy.c.in"
text_fuzzy_status_t text_fuzzy_free_memory (text_fuzzy_t* text_fuzzy);
#line 942 "/usr/home/ben/projects/text-fuzzy/text-fuzzy.c.in"
text_fuzzy_status_t text_fuzzy_set_max_distance (text_fuzzy_t* text_fuzzy, int max_distance);
#line 948 "/usr/home/ben/projects/text-fuzzy/text-fuzzy.c.in"
text_fuzzy_status_t text_fuzzy_get_max_distance (text_fuzzy_t* text_fuzzy, int* max_distance);
#line 954 "/usr/home/ben/projects/text-fuzzy/text-fuzzy.c.in"
text_fuzzy_status_t text_fuzzy_set_transpositions (text_fuzzy_t* text_fuzzy, int transpositions);
#line 960 "/usr/home/ben/projects/text-fuzzy/text-fuzzy.c.in"
text_fuzzy_status_t text_fuzzy_get_transpositions (text_fuzzy_t* text_fuzzy, int* transpositions);
#line 966 "/usr/home/ben/projects/text-fuzzy/text-fuzzy.c.in"
text_fuzzy_status_t text_fuzzy_last_distance (text_fuzzy_t* text_fuzzy, int* last_distance);
#line 972 "/usr/home/ben/projects/text-fuzzy/text-fuzzy.c.in"
text_fuzzy_status_t text_fuzzy_no_alphabet (text_fuzzy_t* text_fuzzy, int yes_no);
#line 982 "/usr/home/ben/projects/text-fuzzy/text-fuzzy.c.in"
text_fuzzy_status_t text_fuzzy_ualphabet_rejections (text_fuzzy_t* text_fuzzy, int* ualphabet_rejections);
#line 988 "/usr/home/ben/projects/text-fuzzy/text-fuzzy.c.in"
text_fuzzy_status_t text_fuzzy_set_no_exact (text_fuzzy_t* text_fuzzy, int yes_no);
#line 994 "/usr/home/ben/projects/text-fuzzy/text-fuzzy.c.in"
text_fuzzy_status_t text_fuzzy_get_length_rejections (text_fuzzy_t* text_fuzzy, int* length_rejections);
#line 1000 "/usr/home/ben/projects/text-fuzzy/text-fuzzy.c.in"
text_fuzzy_status_t text_fuzzy_get_unicode_length (text_fuzzy_t* text_fuzzy, int* unicode_length);
#endif /* CFH_TEXT_FUZZY_H */
#ifdef HEADER
/* Some people are not using GCC-compatible compilers. See, for
example, this test failure.
http://ppm4.activestate.com/sun4-solaris-64/5.12/1200/B/BK/BKB/Text-Fuzzy-0.11.d/log-20130328T025706.txt */
#ifdef __GNUC__
#define STACKALLOCOK 1
#else
/* User's compiler cannot allocate on the stack. */
#undef STACKALLOCOK
#endif
#ifdef PERL_MEMORY_MANAGEMENT
#define CALLOC(x,n,t) Newxz(x,n,t)
#define REALLOC(x,n,t) Renew(x,n,t)
#define FREE(x) Safefree(x)
#else /* not PERL_MEMORY_MANAGEMENT */
/* Perl's Newxz uses the type of the final argument rather than
directly using that as a size, so to make this macro compatible
with Perl we do the same thing. */
#define CALLOC(x,n,t) { \
x = calloc (n, sizeof (t)); \
if (! x) { \
fprintf (stderr, "%s:%d: calloc %d x %d failed.\n", \
__FILE__, __LINE__, n, (int) sizeof (t)); \
return text_fuzzy_status_memory_failure; \
} \
}
#define REALLOC(x,n,t) x = realloc (x, n * sizeof (t))
#define FREE(x) free (x)
#endif /* def PERL_MEMORY_MANAGEMENT */
#endif /* def HEADER */
#ifdef HEADER
//#define VERBOSE 1
/* Alphabet over unicode characters. */
typedef struct ualphabet
{
/* The smallest character in our alphabet. */
int min;
/* The largest character in our alphabet. */
int max;
/* Number of chars allocated in the following array. */
int size;
/* Array containing Unicode alphabet, as a bitmap. */
unsigned char * alphabet;
/* The number of characters which were rejected using the Unicode
alphabet. */
int rejections;
}
ualphabet_t;
/* This structure contains one string of whatever type. */
typedef struct text_fuzzy_string
{
/* The text of the string. */
const char * text;
/* The length of "text". */
int length;
/* The characters of "text" expanded out into unicode
characters. */
int * unicode;
/* The length of "unicode". */
int ulength;
/* Is "text" allocated? */
unsigned int allocated : 1;
}
text_fuzzy_string_t;
/* Match candidates. */
typedef struct candidate candidate_t;
struct candidate
{
int distance;
int offset;
candidate_t * next;
};
/* Dictionary for transposition character matches. The "a" in "adic_t"
stands for "ascii". */
typedef struct
{
int dic[UINT8_MAX];
}
adic_t;
/* One item in dictionary for transposition integer matches. */
typedef struct item
{
/* The character. */
int key;
/* The position of the character in the first string. */
unsigned int position;
}
idic_item_t;
/* Dictionary of last-seen positions of characters for transposition
integer matches. */
typedef struct
{
/* Allocated by size of dictionary. */
idic_item_t * items;
/* Largest value. */
int max;
}
idic_t;
/* Type of edit applied. */
typedef enum
{
/* Use zero value as invalid, so we know if we haven't done things
* properly. */
tf_invalid,
/* Keep, insert, replace, delete for standard Lev. edits. */
tf_keep,
tf_insert,
tf_replace,
tf_delete,
/* Transpose THIS char with the NEXT char. */
tf_transpose,
}
tf_edit_t;
/* The following structure contains one string plus additional
paraphenalia used in searching for the string, for example the
alphabet of the string. */
typedef struct text_fuzzy
{
/* The string we are to match. */
text_fuzzy_string_t text;
/* The matching string. */
text_fuzzy_string_t b;
/* The maximum edit distance we allow for. */
int max_distance;
/* The maximum edit distance the user will allow. We are going to
cheat and ignore the user's value. */
int max_distance_holder;
/* The number of mallocs we are guilty of. */
int n_mallocs;
/* ASCII alphabet */
int alphabet[0x100];
/* The number of characters which were rejected using the ASCII
alphabet. */
int alphabet_rejections;
/* Unicode alphabet. */
ualphabet_t ualphabet;
/* The minimum distance we got in our most recent effort. */
int distance;
/* The number of units allocated for "b.unicode". This is not the
string length. This is used when deciding whether there is
sufficient space to store a test string. */
int b_unicode_length;
/* The number of items which have been rejected because the length
difference is bigger than the maximum edit distance. */
int length_rejections;
/* A character which is not in use. */
unsigned char invalid_char;
/* Candidates for an array match. */
candidate_t first;
candidate_t * last;
/* When scanning an array, put the index of the element of the
array into "text_fuzzy->offset". The offset of the nearest
elements are preserved in the "candidate_t" linked list which
starts off with "text_fuzzy".
There is currently no sanity check, so if the user forgets to
set "offset" each time around the loop, the code will not
notice anything amiss and just send a list of zeros back to the
user. */
int offset;
/* ASCII match dictionary for transposition searches. */
adic_t adic;
/* Unicode match dictionary (integer) for transposition
searches. */
idic_t idic;
/* The edits required to turn "text" into "b". */
tf_edit_t * edits;
int edits_size;
int n_edit;
/* Does the user want to use an alphabet filter? Default is yes,
so this must be set to a non-zero value to switch off use. */
unsigned int user_no_alphabet : 1;
/* Are we actually going to use it? (This may be false even if the
user wants to use it, for silly cases, but is not true if the
user does not want to use it.) */
unsigned int use_alphabet : 1;
unsigned int use_ualphabet : 1;
/* Variable edit costs? (currently unused) */
unsigned int variable_edit_costs : 1;
/* Do we account for transpositions? */
unsigned int transpositions_ok : 1;
/* Did we find it? */
unsigned int found : 1;
/* Is this Unicode? */
unsigned int unicode : 1;
/* Do we want to skip exact matches? */
unsigned int no_exact : 1;
/* Are we scanning a list of entries? */
unsigned int scanning : 1;
/* Do we want an array of answers? */
unsigned int wantarray : 1;
/* Has idic been built for this string? */
unsigned int idic_ok : 1;
}
text_fuzzy_t;
/* The string is not unicode so its length in unicode characters is
unknown. */
#define TEXT_FUZZY_INVALID_UNICODE_LENGTH -1
#endif /* HEADER */
int
minimum (int a, int b)
{
if (a > b) {
return b;
}
return a;
}
/* Dictionaries for the transposition searches. */
/* For the binary search of dic->items. */
static int
item_comp (const void * a, const void * b)
{
idic_item_t * ai = (idic_item_t *) a;
idic_item_t * bi = (idic_item_t *) b;
return (int) (ai->key) - (int) (bi->key);
}
/* For sorting the elements of worduniq into key order. */
static int
uniqcomp (const void * a, const void * b)
{
return (int) (* (int *) a) - (int) (* (int *) b);
}
// s/stack/dic/g;
/* Set up the array of "dic" to be a list of items sorted by the
characters of "word1". */
static int
idic_init_dic (idic_t * dic, const int * word1, int len1)
{
int i;
int prev;
int top;
#ifdef STACKALLOCOK
int worduniq[len1];
#else
int * worduniq;
#endif
#ifndef STACKALLOCOK
CALLOC (worduniq, len1, int);
#endif
/* Put word1 into a copy then sort the copy to get the unique
elements. */
for (i = 0; i < len1; i++) {
worduniq[i] = word1[i];
}
qsort (worduniq, len1, sizeof (int), uniqcomp);
/* Count uniques. */
dic->max = 0;
/* There should not be any zeros in word1. */
prev = 0;
for (i = 0; i < len1; i++) {
int key;
key = worduniq[i];
if (key == prev) {
continue;
}
dic->max++;
prev = key;
}
/* Fill "items" with the keys. Values are all zero at this point. */
CALLOC (dic->items, dic->max, idic_item_t);
top = 0;
/* There should not be any zeros in word1. */
prev = 0;
for (i = 0; i < len1; i++) {
int key;
key = worduniq[i];
if (key == prev) {
continue;
}
dic->items[top].key = key;
dic->items[top].position = 0;
top++;
if (top > dic->max) {
fprintf (stderr, "%s:%d: dic overflow %d > %d.\n",
__FILE__, __LINE__, top, dic->max);
return -1;
}
prev = key;
}
#ifndef STACKALLOCOK
FREE (worduniq);
#endif
/* Because we sorted worduniq above, dic->items is already
sorted into key order, and we don't need to sort it with
item_comp to use bsearch. */
return 0;
}
int
idic_free_dic (idic_t * dic)
{
FREE (dic->items);
dic->items = 0;
return 0;
}
static idic_item_t *
idic_dicfind (idic_t * dic, int key)
{
idic_item_t s = {0};
s.key = key;
/* If the number of items in dic is very small, it might be smart
to use a linear search. */
return bsearch (& s, dic->items, dic->max,
sizeof (idic_item_t), item_comp);
}
/* Look for the most recent example of the character "key" in the
dic of letters from "word1". If it is not there anywhere, return
0. */
int
idic_find (idic_t * dic, int key)
{
idic_item_t * found;
found = idic_dicfind (dic, key);
if (! found) {
return 0;
}
return found->position;
}
/* Set the value of dic[key] to value. */
int
idic_set (idic_t * dic, int key, int position)
{
idic_item_t * found;
found = idic_dicfind (dic, key);
if (! found) {
fprintf (stderr,
"%s:%d: could not set element %d: not found.\n",
__FILE__, __LINE__, key);
return -1;
}
found->position = position;
return 0;
}
/* Reset this dic for another transposition match. */
int
idic_reset (idic_t * dic)
{
int i;
for (i = 0; i < dic->max; i++) {
dic->items[i].position = 0;
}
return 0;
}
int
adic_reset_dic (adic_t * dic)
{
int i;
for (i = 0; i < UINT8_MAX; i++) {
dic->dic[i] = 0;
}
return 0;
}
int
adic_find (adic_t * dic, uint8_t key)
{
return dic->dic[key];
}
int
adic_set (adic_t * dic, uint8_t key, int position)
{
dic->dic[key] = position;
return position;
}
static text_fuzzy_status_t text_fuzzy_large_edits (text_fuzzy_string_t * string, int * size_ptr)
{
int len;
int size;
len = string->length;
if (string->ulength > len) {
len = string->ulength;
}
len *= 2;
size = 1;
while (len >>= 1) {
size <<= 1;
}
* size_ptr = size;
OK;
}
static text_fuzzy_status_t text_fuzzy_allocate_edits (text_fuzzy_t * text_fuzzy)
{
int edits_size;
CALL (large_edits (& text_fuzzy->text, & edits_size));
CALLOC (text_fuzzy->edits, edits_size, tf_edit_t);
text_fuzzy->n_mallocs++;
text_fuzzy->edits_size = edits_size;
OK;
}
static text_fuzzy_status_t text_fuzzy_resize_edits (text_fuzzy_t * text_fuzzy)
{
if (text_fuzzy->b.length > text_fuzzy->edits_size ||
text_fuzzy->b.ulength > text_fuzzy->edits_size) {
int b_size;
CALL (large_edits (& text_fuzzy->b, & b_size));
REALLOC (text_fuzzy->edits, b_size, tf_edit_t);
}
OK;
}
#line 1 "ed-trans-char.c"
#line 2 "ed-trans.c.tmpl"
/* #include <stdio.h> */
/* #include <limits.h> */
/* #include <stdlib.h> */
/* #include <stdint.h> */
/* #include "config.h" */
/* #include "text-fuzzy.h" */
/* #include "ed-trans-char.h" */
static int distance_char_trans (text_fuzzy_t * tf)
{
/* "tf->text" is the studied string, so we need it to be word1. */
const unsigned char * word1 = (const unsigned char *) tf->text.text;
int len1 = tf->text.length;
const unsigned char * word2 = (const unsigned char *) tf->b.text;
int len2 = tf->b.length;
/* Return value. */
int d;
/* X and Y coordinates in the matrix of strings. */
int i;
int j;
/* Unfeasible value; indicates no match. */
int large_value;
/* Maximum distance we are interested in. */
int max;
int size1 = len1 + 2;
int size2 = len2 + 2;
/* This dictionary tracks the locations of characters from word1
we have seen, for the sake of transpositions. */
adic_t * dic;
#ifdef STACKALLOCOK
unsigned int matrix[size1][size2];
#else
unsigned int ** matrix;
#endif
/* First handle the two extreme cases of one or the other strings
being empty. */
if (len1 == 0) {
return len2;
}
if (len2 == 0) {
return len1;
}
#ifndef STACKALLOCOK
CALLOC (matrix, size1, unsigned int *);
for (i = 0; i < size1; i++) {
CALLOC (matrix[i], size2, unsigned int);
}
#endif /* STACKALLOCOK */
dic = & tf->adic;
max = tf->max_distance;
large_value = len1 + len2;
/* Initialize the dynamic programming matrix's values. */
matrix[0][0] = large_value;
matrix[1][0] = large_value;
for (j = 0; j <= len2; j++) {
matrix[1][j + 1] = j;
matrix[0][j + 1] = large_value;
}
for (i = 1; i <= len1; i++) {
/* Last matching column. */
int lmc;
/* Minimum value on the row. */
int row_min;
unsigned char ic;
ic = word1[i - 1];
matrix[i + 1][1] = i;
matrix[i + 1][0] = large_value;
lmc = 0;
row_min = INT_MAX;
for (j = 1; j <= len2; j++) {
/* Last matching row */
unsigned int lmr;
/* Swap score */
unsigned int ss;
unsigned char jc;
jc = word2[j - 1];
/* See if we can find jc somewhere in "word1". */
lmr = adic_find (dic, jc);
if (lmr > 0) {
/* We have found "jc" at some offset into "word1", so
work out the cost of swapping. */
ss = matrix[lmr][lmc] + i + j - 1 - lmr - lmc;
}
else {
/* Have not found "jc" in "word1", so there is no
possibility of transposition, don't bother
calculating any more. See comment about i > 0
below. */
ss = large_value;
}
if (ic != jc) {
int x;
int y;
/* Insertion, deletion, or replacement. */
x = minimum (matrix[i + 1][j], matrix[i][j + 1]);
y = minimum (matrix[i][j], x);
/* Swapping or one of the above. */
matrix[i + 1][j + 1] = minimum (ss, y + 1);
}
else {
/* Exact match, mark this as the last matching
column. */
lmc = j;
/* Copy the character. */
matrix[i + 1][j + 1] = matrix[i][j];
}
row_min = minimum (row_min, matrix[i + 1][j + 1]);
}
if (max > 0 && row_min > max) {
d = max + 1;
goto cleanup;
}
/* Change the location of ic in the dictionary to be "i" since
that is now the most recent example of it. Notice this sets
i > 0 always, even for the first character. */
adic_set (dic, ic, i);
}
d = matrix[len1 + 1][len2 + 1];
cleanup:
#ifndef STACKALLOCOK
for (i = 0; i < size1; i++) {
FREE (matrix[i]);
matrix[i] = 0;
}
FREE (matrix);
#endif /* STACKALLOCOK */
return d;
}
#line 1 "ed-trans-int.c"
#line 2 "ed-trans.c.tmpl"
/* #include <stdio.h> */
/* #include <limits.h> */
/* #include <stdlib.h> */
/* #include <stdint.h> */
/* #include "config.h" */
/* #include "text-fuzzy.h" */
/* #include "ed-trans-int.h" */
static int distance_int_trans (text_fuzzy_t * tf)
{
/* "tf->text" is the studied string, so we need it to be word1. */
const int * word1 = (const int *) tf->text.unicode;
int len1 = tf->text.ulength;
const int * word2 = (const int *) tf->b.unicode;
int len2 = tf->b.ulength;
/* Return value. */
int d;
/* X and Y coordinates in the matrix of strings. */
int i;
int j;
/* Unfeasible value; indicates no match. */
int large_value;
/* Maximum distance we are interested in. */
int max;
int size1 = len1 + 2;
int size2 = len2 + 2;
/* This dictionary tracks the locations of characters from word1
we have seen, for the sake of transpositions. */
idic_t * dic;
#ifdef STACKALLOCOK
unsigned int matrix[size1][size2];
#else
unsigned int ** matrix;
#endif
/* First handle the two extreme cases of one or the other strings
being empty. */
if (len1 == 0) {
return len2;
}
if (len2 == 0) {
return len1;
}
#ifndef STACKALLOCOK
CALLOC (matrix, size1, unsigned int *);
for (i = 0; i < size1; i++) {
CALLOC (matrix[i], size2, unsigned int);
}
#endif /* STACKALLOCOK */
dic = & tf->idic;
max = tf->max_distance;
large_value = len1 + len2;
/* Initialize the dynamic programming matrix's values. */
matrix[0][0] = large_value;
matrix[1][0] = large_value;
for (j = 0; j <= len2; j++) {
matrix[1][j + 1] = j;
matrix[0][j + 1] = large_value;
}
for (i = 1; i <= len1; i++) {
/* Last matching column. */
int lmc;
/* Minimum value on the row. */
int row_min;
int ic;
ic = word1[i - 1];
matrix[i + 1][1] = i;
matrix[i + 1][0] = large_value;
lmc = 0;
row_min = INT_MAX;
for (j = 1; j <= len2; j++) {
/* Last matching row */
unsigned int lmr;
/* Swap score */
unsigned int ss;
int jc;
jc = word2[j - 1];
/* See if we can find jc somewhere in "word1". */
lmr = idic_find (dic, jc);
if (lmr > 0) {
/* We have found "jc" at some offset into "word1", so
work out the cost of swapping. */
ss = matrix[lmr][lmc] + i + j - 1 - lmr - lmc;
}
else {
/* Have not found "jc" in "word1", so there is no
possibility of transposition, don't bother
calculating any more. See comment about i > 0
below. */
ss = large_value;
}
if (ic != jc) {
int x;
int y;
/* Insertion, deletion, or replacement. */
x = minimum (matrix[i + 1][j], matrix[i][j + 1]);
y = minimum (matrix[i][j], x);
/* Swapping or one of the above. */
matrix[i + 1][j + 1] = minimum (ss, y + 1);
}
else {
/* Exact match, mark this as the last matching
column. */
lmc = j;
/* Copy the character. */
matrix[i + 1][j + 1] = matrix[i][j];
}
row_min = minimum (row_min, matrix[i + 1][j + 1]);
}
if (max > 0 && row_min > max) {
d = max + 1;
goto cleanup;
}
/* Change the location of ic in the dictionary to be "i" since
that is now the most recent example of it. Notice this sets
i > 0 always, even for the first character. */
idic_set (dic, ic, i);
}
d = matrix[len1 + 1][len2 + 1];
cleanup:
#ifndef STACKALLOCOK
for (i = 0; i < size1; i++) {
FREE (matrix[i]);
matrix[i] = 0;
}
FREE (matrix);
#endif /* STACKALLOCOK */
return d;
}
#line 1 "edit-distance-char.c"
#line 2 "edit-distance.c.tmpl"
/* #include <stdio.h> */
/* #include <limits.h> */
/* #include <stdlib.h> */
/* #include <stdint.h> */
/* #include "config.h" */
/* #include "text-fuzzy.h" */
/* #include "edit-distance-char.h" */
static int distance_char (text_fuzzy_t * tf)
{
const unsigned char * word1 = (const unsigned char *) tf->b.text;
int len1 = tf->b.length;
const unsigned char * word2 = (const unsigned char *) tf->text.text;
int len2 = tf->text.length;
/* Return value. */
int d;
/* X and Y coordinates in the matrix of strings. */
int i;
int j;
/* Unfeasible value; indicates no match. */
int large_value;
int max;
/* Matrix is the dynamic programming matrix. We economize on space
by having only two columns. */
#ifdef STACKALLOCOK
int matrix[2][len2 + 1];
#else
int * matrix[2];
#endif
tf->n_edit = 0;
max = tf->max_distance;
#ifndef STACKALLOCOK
for (i = 0; i < 2; i++) {
CALLOC (matrix[i], len2 + 1, int);
}
#endif
/*
Initialize the 0 row of "matrix".
0
1
2
3
*/
if (max != NO_MAX_DISTANCE) {
large_value = max + 1;
}
else {
if (len2 > len1) {
large_value = len2;
}
else {
large_value = len1;
}
}
for (j = 0; j <= len2; j++) {
matrix[0][j] = j;
}
/* Loop over column. */
for (i = 1; i <= len1; i++) {
unsigned char c1;
/* The first value to consider of the ith column. */
int min_j;
/* The last value to consider of the ith column. */
int max_j;
/* The smallest value of the matrix in the ith column. */
int col_min;
/* The next column of the matrix to fill in. */
int next;
/* The previously-filled-in column of the matrix. */
int prev;
c1 = word1[i-1];
min_j = 1;
max_j = len2;
/* If we have a maximum permitted distance, we can set the
minimum and maximum columns to inspect to be smaller than
the smallest and largest values respectively. */
if (max != NO_MAX_DISTANCE) {
if (i > max) {
min_j = i - max;
}
if (len2 > max + i) {
max_j = max + i;
}
}
col_min = INT_MAX;
next = i % 2;
if (next == 1) {
prev = 0;
}
else {
prev = 1;
}
matrix[next][0] = i;
/* Loop over rows. */
for (j = 1; j <= len2; j++) {
if (j < min_j || j > max_j) {
/* Put a large value in there. */
matrix[next][j] = large_value;
}
else {
unsigned char c2;
c2 = word2[j - 1];
if (c1 == c2) {
/* The character at position i in word1 is the same as
the character at position j in word2. */
matrix[next][j] = matrix[prev][j - 1];
}
else {
/* The character at position i in word1 is not the
same as the character at position j in word2, so
work out what the minimum cost for getting to cell
i, j is. */
int delete;
int insert;
int substitute;
int minimum;
delete = matrix[prev][j] + 1;
insert = matrix[next][j-1] + 1;
substitute = matrix[prev][j-1] + 1;
minimum = delete;
if (insert < minimum) {
minimum = insert;
}
if (substitute < minimum) {
minimum = substitute;
}
matrix[next][j] = minimum;
}
}
/* Find the minimum value in the ith column. */
if (matrix[next][j] < col_min) {
col_min = matrix[next][j];
}
}
if (max != NO_MAX_DISTANCE) {
if (col_min > max) {
/* All the elements of the ith column are greater than the
maximum, so no match less than or equal to max can be
found by looking at succeeding columns. */
d = large_value;
goto cleanup;
}
}
}
d = matrix[len1 % 2][len2];
cleanup:
#ifndef STACKALLOCOK
for (i = 0; i < 2; i++) {
FREE (matrix[i]);
}
#endif
return d;
}
#line 1 "edit-distance-int.c"
#line 2 "edit-distance.c.tmpl"
/* #include <stdio.h> */
/* #include <limits.h> */
/* #include <stdlib.h> */
/* #include <stdint.h> */
/* #include "config.h" */
/* #include "text-fuzzy.h" */
/* #include "edit-distance-int.h" */
static int distance_int (text_fuzzy_t * tf)
{
const int * word1 = (const int *) tf->b.unicode;
int len1 = tf->b.ulength;
const int * word2 = (const int *) tf->text.unicode;
int len2 = tf->text.ulength;
/* Return value. */
int d;
/* X and Y coordinates in the matrix of strings. */
int i;
int j;
/* Unfeasible value; indicates no match. */
int large_value;
int max;
/* Matrix is the dynamic programming matrix. We economize on space
by having only two columns. */
#ifdef STACKALLOCOK
int matrix[2][len2 + 1];
#else
int * matrix[2];
#endif
tf->n_edit = 0;
max = tf->max_distance;
#ifndef STACKALLOCOK
for (i = 0; i < 2; i++) {
CALLOC (matrix[i], len2 + 1, int);
}
#endif
/*
Initialize the 0 row of "matrix".
0
1
2
3
*/
if (max != NO_MAX_DISTANCE) {
large_value = max + 1;
}
else {
if (len2 > len1) {
large_value = len2;
}
else {
large_value = len1;
}
}
for (j = 0; j <= len2; j++) {
matrix[0][j] = j;
}
/* Loop over column. */
for (i = 1; i <= len1; i++) {
int c1;
/* The first value to consider of the ith column. */
int min_j;
/* The last value to consider of the ith column. */
int max_j;
/* The smallest value of the matrix in the ith column. */
int col_min;
/* The next column of the matrix to fill in. */
int next;
/* The previously-filled-in column of the matrix. */
int prev;
c1 = word1[i-1];
min_j = 1;
max_j = len2;
/* If we have a maximum permitted distance, we can set the
minimum and maximum columns to inspect to be smaller than
the smallest and largest values respectively. */
if (max != NO_MAX_DISTANCE) {
if (i > max) {
min_j = i - max;
}
if (len2 > max + i) {
max_j = max + i;
}
}
col_min = INT_MAX;
next = i % 2;
if (next == 1) {
prev = 0;
}
else {
prev = 1;
}
matrix[next][0] = i;
/* Loop over rows. */
for (j = 1; j <= len2; j++) {
if (j < min_j || j > max_j) {
/* Put a large value in there. */
matrix[next][j] = large_value;
}
else {
int c2;
c2 = word2[j - 1];
if (c1 == c2) {
/* The character at position i in word1 is the same as
the character at position j in word2. */
matrix[next][j] = matrix[prev][j - 1];
}
else {
/* The character at position i in word1 is not the
same as the character at position j in word2, so
work out what the minimum cost for getting to cell
i, j is. */
int delete;
int insert;
int substitute;
int minimum;
delete = matrix[prev][j] + 1;
insert = matrix[next][j-1] + 1;
substitute = matrix[prev][j-1] + 1;
minimum = delete;
if (insert < minimum) {
minimum = insert;
}
if (substitute < minimum) {
minimum = substitute;
}
matrix[next][j] = minimum;
}
}
/* Find the minimum value in the ith column. */
if (matrix[next][j] < col_min) {
col_min = matrix[next][j];
}
}
if (max != NO_MAX_DISTANCE) {
if (col_min > max) {
/* All the elements of the ith column are greater than the
maximum, so no match less than or equal to max can be
found by looking at succeeding columns. */
d = large_value;
goto cleanup;
}
}
}
d = matrix[len1 % 2][len2];
cleanup:
#ifndef STACKALLOCOK
for (i = 0; i < 2; i++) {
FREE (matrix[i]);
}
#endif
return d;
}
/* The following calculations need to be done twice, first when
creating the alphabet and second when looking up a new character in
it. The macro saves us from exasperating bugs. */
#define BYTE_BIT \
byte = ((c - u->min) / 8) ; \
bit = 1 << (c % 8);
/* Generate the Unicode alphabet in "tf->ualphabet". */
text_fuzzy_status_t text_fuzzy_generate_ualphabet (text_fuzzy_t * tf)
{
int i;
/* "u" is a pointer to the alphabet in "tf". This saves repeatedly
typing "tf->ualphabet". */
ualphabet_t * u;
/* "t" is a pointer to the string in "tf". This saves repeatedly
typing "tf->text". */
text_fuzzy_string_t * t;
/* Check this routine was not called by mistake. */
FAIL (! tf->unicode, ualphabet_on_non_unicode);
u = & tf->ualphabet;
t = & tf->text;
MESSAGE ("Alphabetizing %s\n", t->text);
/* Set the maximum to the smallest possible value and the minimum
to the largest possible value. */
u->min = INT_MAX;
u->max = INT_MIN;
/* Get the minimum and maximum values. */
for (i = 0; i < t->ulength; i++) {
/* Character at position "i". */
int c;
c = t->unicode[i];
if (c > u->max) {
u->max = c;
}
if (c < u->min) {
u->min = c;
}
}
MESSAGE ("Range is %X - %X\n", u->min, u->max);
/* The number of bytes we need to store the alphabet. */
u->size = u->max /8 - u->min / 8 + 1;
if (u->size >= UALPHABET_MAX_SIZE) {
/* Give up trying to make this alphabet. */
OK;
}
/* Create a zeroed alphabet. */
CALLOC (u->alphabet, u->size, unsigned char);
tf->n_mallocs++;
/* Get the minimum and maximum values. */
for (i = 0; i < t->ulength; i++) {
/* Character at position "i". */
int c;
/* Byte and bit offset of c in u->alphabet. */
int byte;
unsigned char bit;
c = t->unicode[i];
FAIL (c > u->max || c < u->min, max_min_miscalculation);
BYTE_BIT;
MESSAGE ("Accepting %X at byte %X, bit %X.\n", c, byte, bit);
FAIL_MSG (byte < 0 || byte >= u->size, max_min_miscalculation,
"The value of byte is %d, not within 0 - %d", byte, u->size);
u->alphabet[byte] |= bit;
}
/* We have succeeded. */
tf->use_ualphabet = 1;
MESSAGE ("Size %d, min %d, max %d\n", u->size, u->min, u->max);
OK;
}
/* This returns a true value if the difference between the alphabet of
"b" and the alphabet of "tf" is greater than the maximum distance
which "tf" will accept. */
static int ualphabet_miss (text_fuzzy_t * tf, text_fuzzy_string_t * b)
{
int i;
/* "u" is a pointer to the alphabet in "tf". This saves repeatedly
typing "tf->ualphabet". */
ualphabet_t * u;
/* The number of misses. */
int misses;
FAIL (tf->max_distance == NO_MAX_DISTANCE, max_distance_misuse);
u = & tf->ualphabet;
misses = 0;
for (i = 0; i < tf->b.ulength; i++) {
int c;
c = tf->b.unicode[i];
MESSAGE ("Looking for %X: ", c);
/* Eliminate too large or too small. */
if (c >= u->min && c <= u->max) {
/* Byte and bit offset of c in u->alphabet. */
int byte;
unsigned char bit;
BYTE_BIT;
MESSAGE (" byte %X, bit %X: ", byte, bit);
/* Exact check against the alphabet of "tf". */
if (! (u->alphabet[byte] & bit)) {
MESSAGE ("not ");
misses++;
}
MESSAGE ("there.\n");
}
else {
misses++;
MESSAGE (" out of bounds.\n");
}
/* If we have too many misses, stop searching. */
if (misses > tf->max_distance) {
MESSAGE ("%s:%s: %d misses over %d: ",
tf->text.text, tf->b.text, misses, tf->max_distance);
return 1;
}
}
return 0;
}
/* This is a value for the edit distance which indicates complete
failure to match. */
#define NOT_FOUND -1
#define LENGTH_REJECT(x,y) \
MESSAGE ("Length of %d can never match %d within %d edits", \
(x), (y), tf->max_distance); \
tf->length_rejections++
/* Compare tf and b. This goes through a series of filters which
reject impossible matches, and then if none of the filters applies,
it uses the dynamic programming algorithm to search. The source
code for the dynamic programming algorithms is in
"edit-distance.c.tmpl". */
text_fuzzy_status_t text_fuzzy_compare_single (text_fuzzy_t * tf)
{
/* The edit distance between "tf->search_term" and the
truncated version of "tf->buf". */
int d;
CALL (resize_edits (tf));
d = NOT_FOUND;
tf->found = 0;
if (tf->unicode) {
if (tf->max_distance != NO_MAX_DISTANCE) {
/* Filter on distance: If the distance in the length of
the strings is greater than the max distance, give up,
since the number of additions necessary to make the
strings identical is greater than the maximum distance
we are allowed. */
if (abs (tf->text.ulength - tf->b.ulength) > tf->max_distance) {
LENGTH_REJECT (tf->b.ulength, tf->text.ulength);
OK;
}
if (tf->use_ualphabet) {
/*
Check that the length of "b" is more than the maximum
distance, otherwise the alphabet check will not reject
"b" regardless of the alphabet difference found, since
the largest possible value of "misses" in the alphabet
check is the total number of characters in "b".
*/
if (tf->b.ulength > tf->max_distance) {
/* Filter using alphabet: If the number of
characters in "b" which are not in "tf->text"
is greater than the maximum distance, give
up. */
if (ualphabet_miss (tf, & tf->b)) {
MESSAGE ("Rejected.\n");
tf->ualphabet.rejections++;
OK;
}
else {
MESSAGE ("Accepted.\n");
}
}
else {
MESSAGE ("%s: skipping alphabet check because len %d <= max %d.\n",
tf->b.text, tf->b.ulength, tf->max_distance);
}
}
}
/* Calculate edit distances using the dynamic programming
algorithm for the integer Unicode strings. */
if (tf->transpositions_ok) {
MESSAGE ("Transpositions OK.\n");
if (! tf->idic_ok) {
text_fuzzy_string_t * t = & tf->text;
idic_init_dic (& tf->idic, t->unicode, t->ulength);
tf->n_mallocs++;
tf->idic_ok = 1;
}
idic_reset (& tf->idic);
d = distance_int_trans (tf);
}
else {
MESSAGE ("No transpositions.\n");
d = distance_int (tf);
}
}
else {
/* This is not Unicode. */
if (tf->max_distance != NO_MAX_DISTANCE) {
/* If the distance in the length of the strings is greater
than the max distance, give up. */
if (abs (tf->text.length - tf->b.length) > tf->max_distance) {
LENGTH_REJECT (tf->b.length, tf->text.length);
OK;
}
/* See comment in the Unicode version, above. */
if (tf->b.length > tf->max_distance) {
/* Alphabet filter: eliminate terms which cannot match. */
if (tf->use_alphabet) {
int alphabet_misses;
int l;
alphabet_misses = 0;
for (l = 0; l < tf->b.length; l++) {
int a = (unsigned char) tf->b.text[l];
if (! tf->alphabet[a]) {
alphabet_misses++;
if (alphabet_misses > tf->max_distance) {
/* It is not possible that the two words
are within the maximum edit distance of
each other. */
tf->alphabet_rejections++;
OK;
}
}
}
}
}
}
/* Calculate the edit distance using the dynamic programming
algorithm for "unsigned char". */
if (tf->transpositions_ok) {
adic_reset_dic (& tf->adic);
d = distance_char_trans (tf);
}
else {
d = distance_char (tf);
}
}
/* If we have found something, and either it is less than or equal
to the maximum distance allowed, or we are not checking for
maximum distance, then record this distance and switch on the
"found" flag, "tf->found". */
if (d != NOT_FOUND && (tf->max_distance == NO_MAX_DISTANCE ||
d <= tf->max_distance)) {
if (tf->no_exact) {
/* Skip exact matches. */
if (d == 0) {
OK;
}
}
tf->found = 1;
tf->distance = d;
if (tf->scanning) {
tf->max_distance = tf->distance;
}
if (tf->wantarray) {
candidate_t * c;
CALLOC (c, 1, candidate_t);
tf->n_mallocs += 1;
c->distance = d;
c->offset = tf->offset;
c->next = 0;
tf->last->next = c;
tf->last = c;
}
}
OK;
}
text_fuzzy_status_t text_fuzzy_get_candidates (text_fuzzy_t * text_fuzzy,
int * n_candidates_ptr,
int ** candidates_ptr)
{
candidate_t * c;
candidate_t * last;
int n_candidates = 0;
int * candidates;
int i;
last = text_fuzzy->first.next;
while (last) {
c = last;
last = last->next;
if (c->distance == text_fuzzy->distance) {
n_candidates++;
}
}
if (n_candidates == 0) {
* n_candidates_ptr = 0;
* candidates_ptr = 0;
OK;
}
CALLOC (candidates, n_candidates, int);
text_fuzzy->n_mallocs++;
last = text_fuzzy->first.next;
i = 0;
while (last) {
c = last;
/* Set "last" to the next one here so that we do not
access freed memory. */
last = last->next;
/* Some of the entries might be things which had a lower
distance initially, but then were beaten by later
entries, so here we check that the entry actually does
have the lowest distance, and only if so do we keep
it. */
if (c->distance == text_fuzzy->distance) {
candidates[i] = c->offset;
i++;
}
FREE (c);
text_fuzzy->n_mallocs--;
}
FAIL_MSG (i != n_candidates, miscount,
"Wrong number of entries %d should be %d", i, n_candidates);
* candidates_ptr = candidates;
* n_candidates_ptr = n_candidates;
OK;
}
text_fuzzy_status_t text_fuzzy_free_candidates (text_fuzzy_t * text_fuzzy, int * candidates)
{
if (candidates) {
FREE (candidates);
text_fuzzy->n_mallocs--;
}
OK;
}
/* This is the threshold above which we do not bother computing the
alphabet of the string. If it has more than this number of unique
characters, the alphabet will not reduce the search time by
much. */
static int max_unique_characters = 45;
/* Generate an alphabet from the search word, which is used to filter
non-matching terms without using the dynamic programming
algorithm. */
text_fuzzy_status_t text_fuzzy_generate_alphabet (text_fuzzy_t * text_fuzzy)
{
int unique_characters;
int i;
text_fuzzy->use_alphabet = 1;
for (i = 0; i < 0x100; i++) {
text_fuzzy->alphabet[i] = 0;
}
unique_characters = 0;
for (i = 0; i < text_fuzzy->text.length; i++) {
int c;
c = (unsigned char) text_fuzzy->text.text[i];
if (! text_fuzzy->alphabet[c]) {
unique_characters++;
text_fuzzy->alphabet[c] = 1;
}
}
if (unique_characters > max_unique_characters) {
text_fuzzy->use_alphabet = 0;
}
/* Find an unused slot. This is for the case where the string to
match is not in Unicode, but the string which it is matched
against is in Unicode. */
for (i = 1; i < 0x100; i++) {
if (text_fuzzy->alphabet[i] == 0) {
text_fuzzy->invalid_char = i;
break;
}
}
OK;
}
text_fuzzy_status_t text_fuzzy_begin_scanning (text_fuzzy_t * text_fuzzy)
{
/* Even if the user does not want to set a maximum distance, set
one anyway so that we can reject stuff without going into the
dynamic programming algorithm. Keep the user's value in
"text_fuzzy->max_distance_holder". */
text_fuzzy->max_distance_holder = text_fuzzy->max_distance;
if (text_fuzzy->max_distance == NO_MAX_DISTANCE) {
/* Use INT_MAX / 2 here because INT_MAX + 1 = INT_MIN, causing
hard-to-find bugs. */
text_fuzzy->max_distance = INT_MAX / 2;
}
text_fuzzy->scanning = 1;
/* Set per-scan variables. */
text_fuzzy->distance = -1;
text_fuzzy->ualphabet.rejections = 0;
text_fuzzy->alphabet_rejections = 0;
text_fuzzy->length_rejections = 0;
/* Set up the linked list. */
if (text_fuzzy->wantarray) {
text_fuzzy->last = & text_fuzzy->first;
}
OK;
}
/* Put the user's desired maximum distance back into the object for
the next search. */
text_fuzzy_status_t text_fuzzy_end_scanning (text_fuzzy_t * text_fuzzy)
{
text_fuzzy->max_distance = text_fuzzy->max_distance_holder;
text_fuzzy->scanning = 0;
OK;
}
/* __ _ _ __ _ _
/ _(_) | ___ / _|_ _ _ __ ___| |_(_) ___ _ __ ___
| |_| | |/ _ \ | |_| | | | '_ \ / __| __| |/ _ \| '_ \/ __|
| _| | | __/ | _| |_| | | | | (__| |_| | (_) | | | \__ \
|_| |_|_|\___| |_| \__,_|_| |_|\___|\__|_|\___/|_| |_|___/ */
#define BUF_SIZE 0x1000
typedef struct fuzzy_file
{
const char * file_name;
FILE * fh;
char buf[BUF_SIZE];
char * line;
int length;
text_fuzzy_string_t b;
int remaining;
int offset;
int eof : 1;
}
fuzzy_file_t;
#define SIZE 0x1000
static text_fuzzy_status_t text_fuzzy_more_bytes (fuzzy_file_t * ff)
{
int bytes;
bytes = fread (ff->buf, sizeof (char), SIZE, ff->fh);
if (bytes != SIZE) {
if (feof (ff->fh)) {
ff->eof = 1;
}
else {
FAIL (bytes != SIZE, read_error);
}
}
ff->remaining = bytes;
ff->offset = 0;
OK;
}
static text_fuzzy_status_t text_fuzzy_get_line (fuzzy_file_t * ff)
{
int i;
static char s[SIZE];
i = 0;
while (1) {
char c;
if (! ff->remaining) {
CALL (more_bytes (ff));
}
c = ff->buf[ff->offset];
ff->offset++;
ff->remaining--;
if (c == '\n' || (ff->remaining == 0 && ff->eof)) {
s[i] = '\0';
break;
}
else {
s[i] = c;
}
i++;
FAIL (i >= SIZE, line_too_long);
}
ff->b.text = s;
ff->b.length = i;
OK;
}
static text_fuzzy_status_t text_fuzzy_open (fuzzy_file_t * ff, const char * file_name)
{
ff->file_name = file_name;
ff->fh = fopen (ff->file_name, "r");
FAIL_MSG (! ff->fh, open_error, "failed to open %s: %s", ff->file_name,
strerror (errno));
OK;
}
static text_fuzzy_status_t text_fuzzy_close (fuzzy_file_t * ff)
{
FAIL (fclose (ff->fh), close_error);
OK;
}
/* Scan the file specified by "file_name" for our string. The nearest
string found is returned in "* nearest_ptr". */
text_fuzzy_status_t text_fuzzy_scan_file (text_fuzzy_t * text_fuzzy, char * file_name,
char ** nearest_ptr, int * nearest_length_ptr)
{
fuzzy_file_t ff = {0};
char * nearest;
int found;
CALL (open (& ff, file_name));
CALL (begin_scanning (text_fuzzy));
found = 0;
nearest = 0;
while (1) {
CALL (get_line (& ff));
text_fuzzy->b = ff.b;
CALL (compare_single (text_fuzzy));
if (text_fuzzy->found) {
found = 1;
if (! nearest) {
CALLOC (nearest, ff.b.length + 1, char);
}
else {
REALLOC (nearest, ff.b.length + 1, char);
}
strncpy (nearest, ff.b.text, ff.b.length);
nearest[ff.b.length] = '\0';
}
if (ff.eof && ff.remaining == 0) {
break;
}
}
CALL (close (& ff));
CALL (end_scanning (text_fuzzy));
if (found) {
* nearest_ptr = nearest;
}
else {
* nearest_ptr = 0;
}
OK;
}
text_fuzzy_status_t text_fuzzy_scan_file_free (char * nearest)
{
FREE (nearest);
OK;
}
text_fuzzy_status_t text_fuzzy_alphabet_rejections (text_fuzzy_t * text_fuzzy, int * r)
{
* r = text_fuzzy->alphabet_rejections;
OK;
}
/* Free non-Perl malloc memory using the C library "free". This is all
about "free to wrong pool". See
"http://www.perlmonks.org/?node_id=742205" */
text_fuzzy_status_t text_fuzzy_free_memory (text_fuzzy_t * text_fuzzy)
{
if (text_fuzzy->ualphabet.alphabet) {
FREE (text_fuzzy->ualphabet.alphabet);
text_fuzzy->n_mallocs--;
}
if (text_fuzzy->idic_ok) {
idic_free_dic (& text_fuzzy->idic);
text_fuzzy->idic_ok = 0;
text_fuzzy->n_mallocs--;
}
if (text_fuzzy->edits) {
FREE (text_fuzzy->edits);
text_fuzzy->edits = 0;
text_fuzzy->edits_size = 0;
text_fuzzy->n_mallocs--;
}
OK;
}
text_fuzzy_status_t text_fuzzy_set_max_distance (text_fuzzy_t * text_fuzzy, int max_distance)
{
text_fuzzy->max_distance = max_distance;
OK;
}
text_fuzzy_status_t text_fuzzy_get_max_distance (text_fuzzy_t * text_fuzzy, int * max_distance)
{
* max_distance = text_fuzzy->max_distance;
OK;
}
text_fuzzy_status_t text_fuzzy_set_transpositions (text_fuzzy_t * text_fuzzy, int transpositions)
{
text_fuzzy->transpositions_ok = transpositions != 0 ? 1 : 0;
OK;
}
text_fuzzy_status_t text_fuzzy_get_transpositions (text_fuzzy_t * text_fuzzy, int * transpositions)
{
* transpositions = text_fuzzy->transpositions_ok;
OK;
}
text_fuzzy_status_t text_fuzzy_last_distance (text_fuzzy_t * text_fuzzy, int * last_distance)
{
* last_distance = text_fuzzy->distance;
OK;
}
text_fuzzy_status_t text_fuzzy_no_alphabet (text_fuzzy_t * text_fuzzy, int yes_no)
{
text_fuzzy->user_no_alphabet = yes_no != 0 ? 1 : 0;
if (text_fuzzy->user_no_alphabet) {
text_fuzzy->use_alphabet = 0;
text_fuzzy->use_ualphabet = 0;
}
OK;
}
text_fuzzy_status_t text_fuzzy_ualphabet_rejections (text_fuzzy_t * text_fuzzy, int * ualphabet_rejections)
{
* ualphabet_rejections = text_fuzzy->ualphabet.rejections;
OK;
}
text_fuzzy_status_t text_fuzzy_set_no_exact (text_fuzzy_t * text_fuzzy, int yes_no)
{
text_fuzzy->no_exact = yes_no != 0 ? 1 : 0;
OK;
}
text_fuzzy_status_t text_fuzzy_get_length_rejections (text_fuzzy_t * text_fuzzy, int * length_rejections)
{
* length_rejections = text_fuzzy->length_rejections;
OK;
}
text_fuzzy_status_t text_fuzzy_get_unicode_length (text_fuzzy_t * text_fuzzy, int * unicode_length)
{
if (text_fuzzy->text.unicode) {
* unicode_length = text_fuzzy->text.ulength;
}
else {
* unicode_length = TEXT_FUZZY_INVALID_UNICODE_LENGTH;
}
OK;
}
/* statuses:
status: open_error
status: close_error
status: read_error
status: line_too_long
status: ualphabet_on_non_unicode
%%description:
There was an attempt to make a Unicode alphabet on a non-Unicode string.
%%
status: max_min_miscalculation
status: string_too_long
%%description:
A string for comparison was larger than the value of HUGE defined in the code.
%%
status: max_distance_misuse
%%description:
An attempt was made to use the maximum edit distance which was unset.
%%
status: miscount
*/