/* Fast fuzzy searching among messages.
Copyright (C) 2006 Free Software Foundation, Inc.
Written by Bruno Haible <bruno@clisp.org>, 2006.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif
/* Specification. */
#include "msgl-fsearch.h"
#include <math.h>
#include <stdlib.h>
#include "xalloc.h"
#include "po-charset.h"
/* Fuzzy searching of L strings in a large set of N messages (assuming
they have all the same small size) takes O(L * N) when using repeated
fstrcmp() calls. When for example L = 800 and N = 69000, this is slow.
So we preprocess the set of N messages, yielding a data structure
that allows to select the similar messages for any given string, and
apply fstrcmp() only to the search result. This allows to reduce the
time to something between O(L * 1) and O(L * N) - depending on the
structure of the strings. In the example with L = 800 and N = 69000,
memory use increases by a factor of 2 and the time decreases from
9068 s to 230 s.
The data structure is a hash table mapping each n-gram (here with n=4)
occurring in the message strings to the list of messages that contains
it. For example, if the message list is
[0] "close"
[1] "losers"
the index is a hash table mapping
"clos" -> { 0 }
"lose" -> { 0, 1 }
"oser" -> { 1 }
"sers" -> { 1 }
Searching the similar messages to, say, "closest", is done by looking up
all its 4-grams in the hash table and summing up the results:
"clos" -> { 0 }
"lose" -> { 0, 1 }
"oses" -> {}
"sest" -> {}
=> total: { 2x0, 1x1 }
The list is sorted according to decreasing frequency: { 0, 1, ... }
and only the first few messages from this frequency list are passed to
fstrcmp().
The n-gram similarity and the fstrcmp() similarity are quite different
metrics. For example, "close" and "c l o s e" have no n-grams in common
(even for n as small as n = 2); however, fstrcmp() will find that they
are quite similar (= 0.71). Conversely, "AAAA BBBB ... ZZZZ" and
"ZZZZ ... BBBB AAAA" have many 4-grams in common, but their fstrcmp() is
only 0.02. Also, repeated n-grams don't have the same effect on the
two similarity measures. But all this doesn't matter much in practice.
We chose n = 4 because for alphabetic languages, with n = 3 the occurrence
lists are likely too long. (Well, with ideographic languages like Chinese,
n = 3 might actually be quite good. Anyway, n = 4 is not bad in this case
either.)
The units are characters in the current encoding. Not just bytes,
because 4 consecutive bytes in UTF-8 or GB18030 don't mean much. */
/* Each message is represented by its index in the message list. */
typedef unsigned int index_ty;
/* An index list has its allocated size and length tacked at the front.
The indices are sorted in ascending order. */
typedef index_ty *index_list_ty;
#define IL_ALLOCATED 0
#define IL_LENGTH 1
/* Create a new index list containing only a given index. */
static inline index_list_ty
new_index (index_ty idx)
{
index_ty *list = (index_ty *) xmalloc ((2 + 1) * sizeof (index_ty));
list[IL_ALLOCATED] = 1;
list[IL_LENGTH] = 1;
list[2] = idx;
return list;
}
/* Add a given index, greater or equal than all previous indices, to an
index list.
Return the new index list, if it had to be reallocated, or NULL if it
didn't change. */
static inline index_list_ty
addlast_index (index_list_ty list, index_ty idx)
{
index_list_ty result;
size_t length = list[IL_LENGTH];
/* Look whether it should be inserted. */
if (length > 0 && list[2 + (length - 1)] == idx)
return NULL;
/* Now make room for one more list element. */
result = NULL;
if (length == list[IL_ALLOCATED])
{
size_t new_allocated = 2 * length - (length >> 6);
list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty));
list[IL_ALLOCATED] = new_allocated;
result = list;
}
list[2 + length] = idx;
list[IL_LENGTH] = length + 1;
return result;
}
/* Add a given index to an index list.
Return the new index list, if it had to be reallocated, or NULL if it
didn't change. */
static inline index_list_ty
add_index (index_list_ty list, index_ty idx)
{
index_list_ty result;
size_t length = list[IL_LENGTH];
/* Look where it should be inserted. */
size_t lo = 0;
size_t hi = length;
while (lo < hi)
{
/* Here we know that idx must be inserted at a position >= lo, <= hi. */
size_t mid = (lo + hi) / 2; /* lo <= mid < hi */
index_ty val = list[2 + mid];
if (val < idx)
lo = mid + 1;
else if (val > idx)
hi = mid;
else
return NULL;
}
/* Now make room for one more list element. */
result = NULL;
if (length == list[IL_ALLOCATED])
{
size_t new_allocated = 2 * length - (length >> 6);
list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty));
list[IL_ALLOCATED] = new_allocated;
result = list;
}
list[IL_LENGTH] = length + 1;
for (; length > hi; length--)
list[2 + length] = list[1 + length];
list[2 + length] = idx;
return result;
}
/* We use 4-grams, therefore strings with less than 4 characters cannot be
handled through the 4-grams table and need to be handled specially.
Since every character occupies at most 4 bytes (see po-charset.c),
this means the size of such short strings is bounded by: */
#define SHORT_STRING_MAX_CHARACTERS (4 - 1)
#define SHORT_STRING_MAX_BYTES (SHORT_STRING_MAX_CHARACTERS * 4)
/* Such short strings are handled by direct comparison with all messages
of appropriate size. Note that for two strings of length 0 <= l1 <= l2,
fstrcmp() is <= 2 * l1 / (l1 + l2). Since we are only interested in
fstrcmp() values >= FUZZY_THRESHOLD, we can for a given string of length l
limit the search to lengths l' in the range
l / (2 / FUZZY_THRESHOLD - 1) <= l' <= l * (2 / FUZZY_THRESHOLD - 1)
Thus we need the list of the short strings up to length: */
#define SHORT_MSG_MAX (int) (SHORT_STRING_MAX_BYTES * (2 * FUZZY_THRESHOLD_INV - 1))
/* A fuzzy index contains a hash table mapping all n-grams to their
occurrences list. */
struct message_fuzzy_index_ty
{
message_ty **messages;
character_iterator_t iterator;
hash_table gram4;
size_t firstfew;
message_list_ty *short_messages[SHORT_MSG_MAX + 1];
};
/* Allocate a fuzzy index corresponding to a given list of messages.
The list of messages and the msgctxt and msgid fields of the messages
inside it must not be modified while the returned fuzzy index is in use. */
message_fuzzy_index_ty *
message_fuzzy_index_alloc (const message_list_ty *mlp,
const char *canon_charset)
{
message_fuzzy_index_ty *findex =
(message_fuzzy_index_ty *) xmalloc (sizeof (message_fuzzy_index_ty));
size_t count = mlp->nitems;
size_t j;
size_t l;
findex->messages = mlp->item;
findex->iterator = po_charset_character_iterator (canon_charset);
/* Setup hash table. */
if (hash_init (&findex->gram4, 10 * count) < 0)
xalloc_die ();
for (j = 0; j < count; j++)
{
message_ty *mp = mlp->item[j];
if (mp->msgstr != NULL && mp->msgstr[0] != '\0')
{
const char *str = mp->msgid;
/* Let p0 < p1 < p2 < p3 < p4 walk through the string. */
const char *p0 = str;
if (*p0 != '\0')
{
const char *p1 = p0 + findex->iterator (p0);
if (*p1 != '\0')
{
const char *p2 = p1 + findex->iterator (p1);
if (*p2 != '\0')
{
const char *p3 = p2 + findex->iterator (p2);
if (*p3 != '\0')
{
const char *p4 = p3 + findex->iterator (p3);
for (;;)
{
/* The segment from p0 to p4 is a 4-gram of
characters. Add a hash table entry that maps
it to the index j, or extend the existing
hash table entry accordingly. */
void *found;
if (hash_find_entry (&findex->gram4, p0, p4 - p0,
&found) == 0)
{
index_list_ty list = (index_list_ty) found;
list = addlast_index (list, j);
if (list != NULL)
hash_set_value (&findex->gram4, p0, p4 - p0,
list);
}
else
hash_insert_entry (&findex->gram4, p0, p4 - p0,
new_index (j));
/* Advance. */
if (*p4 == '\0')
break;
p0 = p1;
p1 = p2;
p2 = p3;
p3 = p4;
p4 = p4 + findex->iterator (p4);
}
}
}
}
}
}
}
/* Shrink memory used by the hash table. */
{
void *iter;
const void *key;
size_t keylen;
void **valuep;
iter = NULL;
while (hash_iterate_modify (&findex->gram4, &iter, &key, &keylen, &valuep)
== 0)
{
index_list_ty list = (index_list_ty) *valuep;
index_ty length = list[IL_LENGTH];
if (length < list[IL_ALLOCATED])
{
list[IL_ALLOCATED] = length;
*valuep = xrealloc (list, (2 + length) * sizeof (index_ty));
}
}
}
findex->firstfew = (int) sqrt ((double) count);
if (findex->firstfew < 10)
findex->firstfew = 10;
/* Setup lists of short messages. */
for (l = 0; l <= SHORT_MSG_MAX; l++)
findex->short_messages[l] = message_list_alloc (false);
for (j = 0; j < count; j++)
{
message_ty *mp = mlp->item[j];
if (mp->msgstr != NULL && mp->msgstr[0] != '\0')
{
const char *str = mp->msgid;
size_t len = strlen (str);
if (len <= SHORT_MSG_MAX)
message_list_append (findex->short_messages[len], mp);
}
}
/* Shrink memory used by the lists of short messages. */
for (l = 0; l <= SHORT_MSG_MAX; l++)
{
message_list_ty *mlp = findex->short_messages[l];
if (mlp->nitems < mlp->nitems_max)
{
mlp->nitems_max = mlp->nitems;
mlp->item =
(message_ty **)
xrealloc (mlp->item, mlp->nitems_max * sizeof (message_ty *));
}
}
return findex;
}
/* An index with multiplicity. */
struct mult_index
{
index_ty index;
unsigned int count;
};
/* A list of indices with multiplicity.
The indices are sorted in ascending order. */
struct mult_index_list
{
struct mult_index *item;
size_t nitems;
size_t nitems_max;
/* Work area. */
struct mult_index *item2;
size_t nitems2_max;
};
/* Initialize an empty list of indices with multiplicity. */
static inline void
mult_index_list_init (struct mult_index_list *accu)
{
accu->nitems = 0;
accu->nitems_max = 0;
accu->item = NULL;
accu->nitems2_max = 0;
accu->item2 = NULL;
}
/* Add an index list to a list of indices with multiplicity. */
static inline void
mult_index_list_accumulate (struct mult_index_list *accu, index_list_ty list)
{
size_t len1 = accu->nitems;
size_t len2 = list[IL_LENGTH];
size_t need = len1 + len2;
struct mult_index *ptr1;
struct mult_index *ptr1_end;
index_ty *ptr2;
index_ty *ptr2_end;
struct mult_index *destptr;
/* Make the work area large enough. */
if (accu->nitems2_max < need)
{
size_t new_max = 2 * accu->nitems2_max + 1;
if (new_max < need)
new_max = need;
if (accu->item2 != NULL)
free (accu->item2);
accu->item2 =
(struct mult_index *) xmalloc (new_max * sizeof (struct mult_index));
accu->nitems2_max = new_max;
}
/* Make a linear pass through accu and list simultaneously. */
ptr1 = accu->item;
ptr1_end = ptr1 + len1;
ptr2 = list + 2;
ptr2_end = ptr2 + len2;
destptr = accu->item2;
while (ptr1 < ptr1_end && ptr2 < ptr2_end)
{
if (ptr1->index < *ptr2)
{
*destptr = *ptr1;
ptr1++;
}
else if (ptr1->index > *ptr2)
{
destptr->index = *ptr2;
destptr->count = 1;
ptr2++;
}
else /* ptr1->index == list[2 + i2] */
{
destptr->index = ptr1->index;
destptr->count = ptr1->count + 1;
ptr1++;
ptr2++;
}
destptr++;
}
while (ptr1 < ptr1_end)
{
*destptr = *ptr1;
ptr1++;
destptr++;
}
while (ptr2 < ptr2_end)
{
destptr->index = *ptr2;
destptr->count = 1;
ptr2++;
destptr++;
}
/* Swap accu->item and accu->item2. */
{
struct mult_index *dest = accu->item2;
size_t dest_max = accu->nitems2_max;
accu->item2 = accu->item;
accu->nitems2_max = accu->nitems_max;
accu->item = dest;
accu->nitems = destptr - dest;
accu->nitems_max = dest_max;
}
}
/* Compares two indices with multiplicity, according to their multiplicity. */
static int
mult_index_compare (const void *p1, const void *p2)
{
const struct mult_index *ptr1 = (const struct mult_index *) p1;
const struct mult_index *ptr2 = (const struct mult_index *) p2;
if (ptr1->count < ptr2->count)
return 1;
if (ptr1->count > ptr2->count)
return -1;
/* For reproduceable results, also take into account the indices. */
if (ptr1->index > ptr2->index)
return 1;
if (ptr1->index < ptr2->index)
return -1;
return 0;
}
/* Sort a list of indices with multiplicity according to decreasing
multiplicity. */
static inline void
mult_index_list_sort (struct mult_index_list *accu)
{
if (accu->nitems > 1)
qsort (accu->item, accu->nitems, sizeof (struct mult_index),
mult_index_compare);
}
/* Frees a list of indices with multiplicity. */
static inline void
mult_index_list_free (struct mult_index_list *accu)
{
if (accu->item != NULL)
free (accu->item);
if (accu->item2 != NULL)
free (accu->item2);
}
/* Find a good match for the given msgctxt and msgid in the given fuzzy index.
The match does not need to be optimal. */
message_ty *
message_fuzzy_index_search (message_fuzzy_index_ty *findex,
const char *msgctxt, const char *msgid)
{
const char *str = msgid;
/* Let p0 < p1 < p2 < p3 < p4 walk through the string. */
const char *p0 = str;
if (*p0 != '\0')
{
const char *p1 = p0 + findex->iterator (p0);
if (*p1 != '\0')
{
const char *p2 = p1 + findex->iterator (p1);
if (*p2 != '\0')
{
const char *p3 = p2 + findex->iterator (p2);
if (*p3 != '\0')
{
const char *p4 = p3 + findex->iterator (p3);
struct mult_index_list accu;
mult_index_list_init (&accu);
for (;;)
{
/* The segment from p0 to p4 is a 4-gram of
characters. Get the hash table entry containing
a list of indices, and add it to the accu. */
void *found;
if (hash_find_entry (&findex->gram4, p0, p4 - p0,
&found) == 0)
{
index_list_ty list = (index_list_ty) found;
mult_index_list_accumulate (&accu, list);
}
/* Advance. */
if (*p4 == '\0')
break;
p0 = p1;
p1 = p2;
p2 = p3;
p3 = p4;
p4 = p4 + findex->iterator (p4);
}
/* Sort in decreasing count order. */
mult_index_list_sort (&accu);
/* Take the first few messages from this sorted list, and
maximize the fstrcmp() result. */
{
size_t count;
struct mult_index *ptr;
message_ty *best_mp;
double best_weight;
count = findex->firstfew;
if (count > accu.nitems)
count = accu.nitems;
best_weight = FUZZY_THRESHOLD;
best_mp = NULL;
for (ptr = accu.item; count > 0; ptr++, count--)
{
message_ty *mp = findex->messages[ptr->index];
double weight =
fuzzy_search_goal_function (mp, msgctxt, msgid);
if (weight > best_weight)
{
best_weight = weight;
best_mp = mp;
}
}
mult_index_list_free (&accu);
return best_mp;
}
}
}
}
}
/* The string had less than 4 characters. */
{
size_t l = strlen (str);
size_t lmin, lmax;
message_ty *best_mp;
double best_weight;
if (!(l <= SHORT_STRING_MAX_BYTES))
abort ();
/* Walk through those short messages which have an appropriate length.
See the comment before SHORT_MSG_MAX. */
lmin = (int) ceil (l / (2 / FUZZY_THRESHOLD - 1));
lmax = (int) (l * (2 / FUZZY_THRESHOLD - 1));
if (!(lmax <= SHORT_MSG_MAX))
abort ();
best_weight = FUZZY_THRESHOLD;
best_mp = NULL;
for (l = lmin; l <= lmax; l++)
{
message_list_ty *mlp = findex->short_messages[l];
size_t j;
for (j = 0; j < mlp->nitems; j++)
{
message_ty *mp = mlp->item[j];
double weight = fuzzy_search_goal_function (mp, msgctxt, msgid);
if (weight > best_weight)
{
best_weight = weight;
best_mp = mp;
}
}
}
return best_mp;
}
}
/* Free a fuzzy index. */
void
message_fuzzy_index_free (message_fuzzy_index_ty *findex)
{
size_t l;
void *iter;
const void *key;
size_t keylen;
void *data;
/* Free the short lists. */
for (l = 0; l <= SHORT_MSG_MAX; l++)
message_list_free (findex->short_messages[l], 1);
/* Free the index lists occurring as values in the hash tables. */
iter = NULL;
while (hash_iterate (&findex->gram4, &iter, &key, &keylen, &data) == 0)
free ((index_list_ty *) data);
/* Free the hash table itself. */
hash_destroy (&findex->gram4);
free (findex);
}