/*
* Stack-less Just-In-Time compiler
*
* Copyright Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are
* permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice, this list
* of conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
* SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "sljitLir.h"
#include "regexJIT.h"
#include <stdlib.h>
#ifdef REGEX_MATCH_VERBOSE
#include <stdio.h>
#endif
/* Extra, hidden flags:
{id!} where id > 0 found in the code. */
#define REGEX_ID_CHECK 0x100
/* When REGEX_NEWLINE && REGEX_MATCH_BEGIN defined, the pattern turn to a normal search,
which starts with [\r\n] character range. */
#define REGEX_FAKE_MATCH_BEGIN 0x200
/* When REGEX_NEWLINE && REGEX_MATCH_END defined, the pattern turn to a normal search,
which ends with [\r\n] character range. */
#define REGEX_FAKE_MATCH_END 0x400
/* Check match completition after every (FINISH_TEST + 1) steps. */
#define FINISH_TEST 0x7
/* --------------------------------------------------------------------- */
/* Structures for JIT-ed pattern matching */
/* --------------------------------------------------------------------- */
struct regex_machine
{
/* flags. */
int flags;
/* Number of state descriptors for one term. */
sljit_sw no_states;
/* Total size. */
sljit_sw size;
union {
void *init_match;
sljit_sw (SLJIT_CALL *call_init)(void *next, void* match);
} u;
#if (defined SLJIT_INDIRECT_CALL && SLJIT_INDIRECT_CALL)
struct sljit_function_context context;
#endif
void *continue_match;
/* Variable sized array to contain the handler addresses. */
sljit_uw entry_addrs[1];
};
struct regex_match
{
/* Current and next state array. */
sljit_sw *current;
sljit_sw *next;
/* Starting. */
sljit_sw head;
/* String character index (ever increasing). */
sljit_sw index;
/* Best match found so far (members in priority order). */
sljit_sw best_begin;
sljit_sw best_end;
sljit_sw best_id;
/* Bool flags (encoded as word). */
sljit_sw fast_quit;
sljit_sw fast_forward;
/* Machine. */
struct regex_machine *machine;
union {
void *continue_match;
void (SLJIT_CALL *call_continue)(struct regex_match *match, const regex_char_t *input_string, int length);
} u;
/* Variable sized array to contain the state arrays. */
sljit_sw states[1];
};
/* State vector
ITEM[0] - pointer to the address inside the machine code
ITEM[1] - next pointer
ITEM[2] - string started from (optional)
ITEM[3] - max ID (optional) */
/* Register allocation. */
/* Current state array (loaded & stored: regex_match->current). */
#define R_CURR_STATE SLJIT_S0
/* Next state array (loaded & stored: regex_match->next). */
#define R_NEXT_STATE SLJIT_S1
/* Head (loaded & stored: regex_match->head). */
#define R_NEXT_HEAD SLJIT_S2
/* String fragment pointer. */
#define R_STRING SLJIT_S3
/* String fragment length. */
#define R_LENGTH SLJIT_S4
/* 'struct regex_match*' */
#define R_REGEX_MATCH SLJIT_R0
/* Current character. */
#define R_CURR_CHAR SLJIT_R1
/* Temporary register. */
#define R_TEMP SLJIT_R2
/* Caches the regex_match->best_begin. */
#define R_BEST_BEGIN SLJIT_R3
/* Current character index. */
#define R_CURR_INDEX SLJIT_R4
/* --------------------------------------------------------------------- */
/* Stack management */
/* --------------------------------------------------------------------- */
/* Try to allocate 2^n blocks. */
#define STACK_FRAGMENT_SIZE (((64 * sizeof(struct stack_item)) - (sizeof(struct stack_fragment_data))) / (sizeof(struct stack_item)))
struct stack_item {
int type;
int value;
};
struct stack_fragment_data {
struct stack_fragment *next;
struct stack_fragment *prev;
};
struct stack_fragment {
struct stack_fragment_data data;
struct stack_item items[STACK_FRAGMENT_SIZE];
};
struct stack {
struct stack_fragment *first;
struct stack_fragment *last;
int index;
int count;
};
#if (defined SLJIT_DEBUG && SLJIT_DEBUG)
static void stack_check(struct stack *stack)
{
struct stack_fragment *curr;
int found;
if (!stack)
return;
SLJIT_ASSERT(stack->index >= 0 && stack->index < STACK_FRAGMENT_SIZE);
if (stack->first == NULL) {
SLJIT_ASSERT(stack->first == NULL && stack->last == NULL);
SLJIT_ASSERT(stack->index == STACK_FRAGMENT_SIZE - 1 && stack->count == 0);
return;
}
found = 0;
if (stack->last == NULL) {
SLJIT_ASSERT(stack->index == STACK_FRAGMENT_SIZE - 1 && stack->count == 0);
found = 1;
}
else
SLJIT_ASSERT(stack->index >= 0 && stack->count >= 0);
SLJIT_ASSERT(stack->first->data.prev == NULL);
curr = stack->first;
while (curr) {
if (curr == stack->last)
found = 1;
if (curr->data.next)
SLJIT_ASSERT(curr->data.next->data.prev == curr);
curr = curr->data.next;
}
SLJIT_ASSERT(found);
}
#endif
static void stack_init(struct stack *stack)
{
stack->first = NULL;
stack->last = NULL;
stack->index = STACK_FRAGMENT_SIZE - 1;
stack->count = 0;
}
static void stack_destroy(struct stack *stack)
{
struct stack_fragment *curr = stack->first;
struct stack_fragment *prev;
#if (defined SLJIT_DEBUG && SLJIT_DEBUG)
stack_check(stack);
#endif
while (curr) {
prev = curr;
curr = curr->data.next;
SLJIT_FREE(prev, NULL);
}
}
static SLJIT_INLINE struct stack_item* stack_top(struct stack *stack)
{
SLJIT_ASSERT(stack->last);
return stack->last->items + stack->index;
}
static int stack_push(struct stack *stack, int type, int value)
{
if (stack->last) {
stack->index++;
if (stack->index >= STACK_FRAGMENT_SIZE) {
stack->index = 0;
if (!stack->last->data.next) {
stack->last->data.next = (struct stack_fragment*)SLJIT_MALLOC(sizeof(struct stack_fragment), NULL);
if (!stack->last->data.next)
return 1;
stack->last->data.next->data.next = NULL;
stack->last->data.next->data.prev = stack->last;
}
stack->last = stack->last->data.next;
}
}
else if (!stack->first) {
stack->last = (struct stack_fragment*)SLJIT_MALLOC(sizeof(struct stack_fragment), NULL);
if (!stack->last)
return 1;
stack->last->data.prev = NULL;
stack->last->data.next = NULL;
stack->first = stack->last;
stack->index = 0;
}
else {
stack->last = stack->first;
stack->index = 0;
}
stack->last->items[stack->index].type = type;
stack->last->items[stack->index].value = value;
stack->count++;
#if (defined SLJIT_DEBUG && SLJIT_DEBUG)
stack_check(stack);
#endif
return 0;
}
static struct stack_item* stack_pop(struct stack *stack)
{
struct stack_item *ret = stack_top(stack);
if (stack->index > 0)
stack->index--;
else {
stack->last = stack->last->data.prev;
stack->index = STACK_FRAGMENT_SIZE - 1;
}
stack->count--;
#if (defined SLJIT_DEBUG && SLJIT_DEBUG)
stack_check(stack);
#endif
return ret;
}
static SLJIT_INLINE void stack_clone(struct stack *src, struct stack *dst)
{
*dst = *src;
}
static int stack_push_copy(struct stack *stack, int items, int length)
{
struct stack_fragment *frag1;
int ind1;
struct stack_fragment *frag2;
int ind2;
int counter;
SLJIT_ASSERT(stack->count >= length && items <= length && items > 0);
/* Allocate the necessary elements. */
counter = items;
frag1 = stack->last;
ind1 = stack->index;
while (counter > 0) {
if (stack->index + counter >= STACK_FRAGMENT_SIZE) {
counter -= STACK_FRAGMENT_SIZE - stack->index - 1 + 1;
stack->index = 0;
if (!stack->last->data.next) {
stack->last->data.next = (struct stack_fragment*)SLJIT_MALLOC(sizeof(struct stack_fragment), NULL);
if (!stack->last->data.next)
return 1;
stack->last->data.next->data.next = NULL;
stack->last->data.next->data.prev = stack->last;
}
stack->last = stack->last->data.next;
}
else {
stack->index += counter;
counter = 0;
}
}
frag2 = stack->last;
ind2 = stack->index;
while (length > 0) {
frag2->items[ind2--] = frag1->items[ind1--];
if (ind1 < 0) {
ind1 = STACK_FRAGMENT_SIZE - 1;
frag1 = frag1->data.prev;
}
if (ind2 < 0) {
ind2 = STACK_FRAGMENT_SIZE - 1;
frag2 = frag2->data.prev;
}
length--;
}
#if (defined SLJIT_DEBUG && SLJIT_DEBUG)
stack_check(stack);
#endif
stack->count += items;
return 0;
}
/* --------------------------------------------------------------------- */
/* Parser */
/* --------------------------------------------------------------------- */
enum {
/* Common. */
type_begin,
type_end,
type_char,
type_newline,
type_id,
type_rng_start,
type_rng_end,
type_rng_char,
type_rng_left,
type_rng_right,
/* generator only. */
type_branch,
type_jump,
/* Parser only. */
type_open_br,
type_close_br,
type_select,
type_asterisk,
type_plus_sign,
type_qestion_mark
};
struct compiler_common {
/* Temporary stacks. */
struct stack stack;
struct stack depth;
/* REGEX_ flags. */
int flags;
/* Encoded size of the dfa representation. */
sljit_sw dfa_size;
/* Number of terms. */
sljit_sw terms_size;
/* Number of state descriptors for one term (same as machine->no_states). */
sljit_sw no_states;
/* Number of type_rng_(char|left)-s in the longest character range. */
sljit_sw longest_range_size;
/* DFA linear representation (size: dfa_size). */
struct stack_item *dfa_transitions;
/* Term id and search state pairs (size: dfa_size). */
struct stack_item *search_states;
/* sljit compiler */
struct sljit_compiler *compiler;
/* Machine data, which must be kept for later use. */
struct regex_machine *machine;
/* Temporary space for jumps (size: longest_range_size). */
struct sljit_jump **range_jump_list;
};
static const regex_char_t* decode_number(const regex_char_t *regex_string, int length, int *result)
{
int value = 0;
SLJIT_ASSERT(length > 0);
if (*regex_string < '0' || *regex_string > '9') {
*result = -1;
return regex_string;
}
while (length > 0 && *regex_string >= '0' && *regex_string <= '9') {
value = value * 10 + (*regex_string - '0');
length--;
regex_string++;
}
*result = value;
return regex_string;
}
static int iterate(struct stack *stack, int min, int max)
{
struct stack it;
struct stack_item *item;
int count = -1;
int len = 0;
int depth = 0;
stack_clone(stack, &it);
/* Calculate size. */
while (count < 0) {
item = stack_pop(&it);
switch (item->type) {
case type_id:
case type_rng_end:
case type_rng_char:
case type_rng_left:
case type_rng_right:
case type_plus_sign:
case type_qestion_mark:
len++;
break;
case type_asterisk:
len += 2;
break;
case type_close_br:
depth++;
break;
case type_open_br:
SLJIT_ASSERT(depth > 0);
depth--;
if (depth == 0)
count = it.count;
break;
case type_select:
SLJIT_ASSERT(depth > 0);
len += 2;
break;
default:
SLJIT_ASSERT(item->type != type_begin && item->type != type_end);
if (depth == 0)
count = it.count;
len++;
break;
}
}
if (min == 0 && max == 0) {
/* {0,0} case, not {0,} case: delete subtree. */
stack_clone(&it, stack);
/* and put an empty bracket expression instead of it. */
if (stack_push(stack, type_open_br, 0))
return REGEX_MEMORY_ERROR;
if (stack_push(stack, type_close_br, 0))
return REGEX_MEMORY_ERROR;
return len;
}
count = stack->count - count;
/* Put an open bracket before the sequence. */
if (stack_push_copy(stack, 1, count))
return -1;
#if (defined SLJIT_DEBUG && SLJIT_DEBUG)
SLJIT_ASSERT(stack_push(&it, type_open_br, 0) == 0);
#else
stack_push(&it, type_open_br, 0);
#endif
/* Copy the data. */
if (max > 0) {
len = len * (max - 1);
max -= min;
/* Insert ? operators. */
len += max;
if (min > 0) {
min--;
while (min > 0) {
if (stack_push_copy(stack, count, count))
return -1;
min--;
}
if (max > 0) {
if (stack_push_copy(stack, count, count))
return -1;
if (stack_push(stack, type_qestion_mark, 0))
return REGEX_MEMORY_ERROR;
count++;
max--;
}
}
else {
SLJIT_ASSERT(max > 0);
max--;
count++;
if (stack_push(stack, type_qestion_mark, 0))
return REGEX_MEMORY_ERROR;
}
while (max > 0) {
if (stack_push_copy(stack, count, count))
return -1;
max--;
}
}
else {
SLJIT_ASSERT(min > 0);
min--;
/* Insert + operator. */
len = len * min + 1;
while (min > 0) {
if (stack_push_copy(stack, count, count))
return -1;
min--;
}
if (stack_push(stack, type_plus_sign, 0))
return REGEX_MEMORY_ERROR;
}
/* Close the opened bracket. */
if (stack_push(stack, type_close_br, 0))
return REGEX_MEMORY_ERROR;
return len;
}
static int parse_iterator(const regex_char_t *regex_string, int length, struct stack *stack, sljit_sw *dfa_size, int begin)
{
/* We only know that *regex_string == { . */
int val1, val2;
const regex_char_t *base_from = regex_string;
const regex_char_t *from;
length--;
regex_string++;
/* Decode left value. */
val2 = -1;
if (length == 0)
return -2;
if (*regex_string == ',') {
val1 = 0;
length--;
regex_string++;
}
else {
from = regex_string;
regex_string = decode_number(regex_string, length, &val1);
if (val1 < 0)
return -2;
length -= regex_string - from;
if (length == 0)
return -2;
if (*regex_string == '}') {
val2 = val1;
if (val1 == 0)
val1 = -1;
}
else if (length >= 2 && *regex_string == '!' && regex_string[1] == '}') {
/* Non posix extension. */
if (stack_push(stack, type_id, val1))
return -1;
(*dfa_size)++;
return (regex_string - base_from) + 1;
}
else {
if (*regex_string != ',')
return -2;
length--;
regex_string++;
}
}
if (begin)
return -2;
/* Decode right value. */
if (val2 == -1) {
if (length == 0)
return -2;
if (*regex_string == '}')
val2 = 0;
else {
from = regex_string;
regex_string = decode_number(regex_string, length, &val2);
length -= regex_string - from;
if (val2 < 0 || length == 0 || *regex_string != '}' || val2 < val1)
return -2;
if (val2 == 0) {
SLJIT_ASSERT(val1 == 0);
val1 = -1;
}
}
}
/* Fast cases. */
if (val1 > 1 || val2 > 1) {
val1 = iterate(stack, val1, val2);
if (val1 < 0)
return -1;
*dfa_size += val1;
}
else if (val1 == 0 && val2 == 0) {
if (stack_push(stack, type_asterisk, 0))
return -1;
*dfa_size += 2;
}
else if (val1 == 1 && val2 == 0) {
if (stack_push(stack, type_plus_sign, 0))
return -1;
(*dfa_size)++;
}
else if (val1 == 0 && val2 == 1) {
if (stack_push(stack, type_qestion_mark, 0))
return -1;
(*dfa_size)++;
}
else if (val1 == -1) {
val1 = iterate(stack, 0, 0);
if (val1 < 0)
return -1;
*dfa_size -= val1;
SLJIT_ASSERT(*dfa_size >= 2);
}
else {
/* Ignore. */
SLJIT_ASSERT(val1 == 1 && val2 == 1);
}
return regex_string - base_from;
}
static int parse_char_range(const regex_char_t *regex_string, int length, struct compiler_common *compiler_common)
{
struct stack* stack = &compiler_common->stack;
const regex_char_t *base_from = regex_string;
regex_char_t left_char, right_char, tmp_char;
length--;
regex_string++;
if (length == 0)
return -2;
if (*regex_string != '^') {
if (stack_push(stack, type_rng_start, 0))
return -1;
}
else {
length--;
regex_string++;
if (length == 0)
return -2;
if (stack_push(stack, type_rng_start, 1))
return -1;
}
/* For both the type_rng_start & type_rng_end. */
compiler_common->dfa_size += 2;
/* Range must be at least 1 character. */
if (*regex_string == ']') {
length--;
regex_string++;
if (stack_push(stack, type_rng_char, ']'))
return -1;
compiler_common->dfa_size++;
}
while (1) {
if (length == 0)
return -2;
if (*regex_string == ']')
break;
if (*regex_string != '\\')
left_char = *regex_string;
else {
regex_string++;
length--;
if (length == 0)
return -2;
left_char = *regex_string;
}
regex_string++;
length--;
/* Is a range here? */
if (length >= 3 && *regex_string == '-' && *(regex_string + 1) != ']') {
regex_string++;
length--;
if (*regex_string != '\\')
right_char = *regex_string;
else {
regex_string++;
length--;
if (length == 0)
return -2;
right_char = *regex_string;
}
regex_string++;
length--;
if (left_char > right_char) {
/* Swap if necessary. */
tmp_char = left_char;
left_char = right_char;
right_char = tmp_char;
}
if (stack_push(stack, type_rng_left, left_char))
return -1;
if (stack_push(stack, type_rng_right, right_char))
return -1;
compiler_common->dfa_size += 2;
}
else {
if (stack_push(stack, type_rng_char, left_char))
return -1;
compiler_common->dfa_size++;
}
}
if (stack_push(stack, type_rng_end, 0))
return -1;
return regex_string - base_from;
}
static int parse(const regex_char_t *regex_string, int length, struct compiler_common *compiler_common)
{
/* Depth of bracketed expressions. */
int depth = 0;
/* Have we already found a term? '1' if not yet. */
int begin = 1;
/* Cache stack pointer. */
struct stack* stack = &compiler_common->stack;
int tmp;
/* Type_begin and type_end. */
compiler_common->dfa_size = 2;
stack_init(stack);
if (stack_push(stack, type_begin, 0))
return REGEX_MEMORY_ERROR;
if (length > 0 && *regex_string == '^') {
compiler_common->flags |= REGEX_MATCH_BEGIN;
length--;
regex_string++;
}
if ((compiler_common->flags & (REGEX_MATCH_BEGIN | REGEX_NEWLINE)) == (REGEX_MATCH_BEGIN | REGEX_NEWLINE)) {
/* Replace REGEX_MATCH_BEGIN flag to REGEX_FAKE_MATCH_BEGIN */
compiler_common->flags &= ~REGEX_MATCH_BEGIN;
compiler_common->flags |= REGEX_FAKE_MATCH_BEGIN;
/* and append a new-line search. */
if (stack_push(stack, type_newline, 0))
return REGEX_MEMORY_ERROR;
compiler_common->dfa_size++;
/* Begin intentionally kept as 1. */
}
while (length > 0) {
switch (*regex_string) {
case '\\' :
length--;
regex_string++;
if (length == 0)
return REGEX_INVALID_REGEX;
if (stack_push(stack, type_char, *regex_string))
return REGEX_MEMORY_ERROR;
begin = 0;
compiler_common->dfa_size++;
break;
case '.' :
if (stack_push(stack, type_rng_start, 1))
return REGEX_MEMORY_ERROR;
if (compiler_common->flags & REGEX_NEWLINE) {
if (stack_push(stack, type_rng_char, '\n'))
return REGEX_MEMORY_ERROR;
if (stack_push(stack, type_rng_char, '\r'))
return REGEX_MEMORY_ERROR;
compiler_common->dfa_size += 2;
}
if (stack_push(stack, type_rng_end, 1))
return REGEX_MEMORY_ERROR;
begin = 0;
compiler_common->dfa_size += 2;
break;
case '(' :
depth++;
if (stack_push(stack, type_open_br, 0))
return REGEX_MEMORY_ERROR;
begin = 1;
break;
case ')' :
if (depth == 0)
return REGEX_INVALID_REGEX;
depth--;
if (stack_push(stack, type_close_br, 0))
return REGEX_MEMORY_ERROR;
begin = 0;
break;
case '|' :
if (stack_push(stack, type_select, 0))
return REGEX_MEMORY_ERROR;
begin = 1;
compiler_common->dfa_size += 2;
break;
case '*' :
if (begin)
return REGEX_INVALID_REGEX;
if (stack_push(stack, type_asterisk, 0))
return REGEX_MEMORY_ERROR;
compiler_common->dfa_size += 2;
break;
case '?' :
case '+' :
if (begin)
return REGEX_INVALID_REGEX;
if (stack_push(stack, (*regex_string == '+') ? type_plus_sign : type_qestion_mark, 0))
return REGEX_MEMORY_ERROR;
compiler_common->dfa_size++;
break;
case '{' :
tmp = parse_iterator(regex_string, length, stack, &compiler_common->dfa_size, begin);
if (tmp >= 0) {
length -= tmp;
regex_string += tmp;
}
else if (tmp == -1)
return REGEX_MEMORY_ERROR;
else {
/* Not a valid range expression. */
SLJIT_ASSERT(tmp == -2);
if (stack_push(stack, type_char, '{'))
return REGEX_MEMORY_ERROR;
compiler_common->dfa_size++;
}
break;
case '[' :
tmp = parse_char_range(regex_string, length, compiler_common);
if (tmp >= 0) {
length -= tmp;
regex_string += tmp;
}
else if (tmp == -1)
return REGEX_MEMORY_ERROR;
else {
SLJIT_ASSERT(tmp == -2);
return REGEX_INVALID_REGEX;
}
begin = 0;
break;
default:
if (length == 1 && *regex_string == '$') {
compiler_common->flags |= REGEX_MATCH_END;
break;
}
if (stack_push(stack, type_char, *regex_string))
return REGEX_MEMORY_ERROR;
begin = 0;
compiler_common->dfa_size++;
break;
}
length--;
regex_string++;
}
if (depth != 0)
return REGEX_INVALID_REGEX;
if ((compiler_common->flags & (REGEX_MATCH_END | REGEX_NEWLINE)) == (REGEX_MATCH_END | REGEX_NEWLINE)) {
/* Replace REGEX_MATCH_END flag to REGEX_FAKE_MATCH_END */
compiler_common->flags &= ~REGEX_MATCH_END;
compiler_common->flags |= REGEX_FAKE_MATCH_END;
/* and append a new-line search. */
if (stack_push(stack, type_newline, 1))
return REGEX_MEMORY_ERROR;
compiler_common->dfa_size++;
/* Begin intentionally kept as 1. */
}
if (stack_push(stack, type_end, 0))
return REGEX_MEMORY_ERROR;
return REGEX_NO_ERROR;
}
/* --------------------------------------------------------------------- */
/* Generating machine state transitions */
/* --------------------------------------------------------------------- */
#define PUT_TRANSITION(typ, val) \
do { \
--transitions_ptr; \
transitions_ptr->type = typ; \
transitions_ptr->value = val; \
} while (0)
static struct stack_item* handle_iteratives(struct stack_item *transitions_ptr, struct stack_item *transitions, struct stack *depth)
{
struct stack_item *item;
while (1) {
item = stack_top(depth);
switch (item->type) {
case type_asterisk:
SLJIT_ASSERT(transitions[item->value].type == type_branch);
transitions[item->value].value = transitions_ptr - transitions;
PUT_TRANSITION(type_branch, item->value + 1);
break;
case type_plus_sign:
SLJIT_ASSERT(transitions[item->value].type == type_branch);
transitions[item->value].value = transitions_ptr - transitions;
break;
case type_qestion_mark:
PUT_TRANSITION(type_branch, item->value);
break;
default:
return transitions_ptr;
}
stack_pop(depth);
}
}
static int generate_transitions(struct compiler_common *compiler_common)
{
struct stack *stack = &compiler_common->stack;
struct stack *depth = &compiler_common->depth;
struct stack_item *transitions_ptr;
struct stack_item *item;
stack_init(depth);
compiler_common->dfa_transitions = SLJIT_MALLOC(sizeof(struct stack_item) * compiler_common->dfa_size, NULL);
if (!compiler_common->dfa_transitions)
return REGEX_MEMORY_ERROR;
/* Go through the items of the stack and generate the necessary branches and jumps (edges of DFA). */
transitions_ptr = compiler_common->dfa_transitions + compiler_common->dfa_size;
while (stack->count > 0) {
item = stack_pop(stack);
switch (item->type) {
case type_begin:
case type_open_br:
item = stack_pop(depth);
if (item->type == type_select)
PUT_TRANSITION(type_branch, item->value + 1);
else
SLJIT_ASSERT(item->type == type_close_br);
if (stack->count == 0)
PUT_TRANSITION(type_begin, 0);
else
transitions_ptr = handle_iteratives(transitions_ptr, compiler_common->dfa_transitions, depth);
break;
case type_end:
case type_close_br:
if (item->type == type_end)
*--transitions_ptr = *item;
if (stack_push(depth, type_close_br, transitions_ptr - compiler_common->dfa_transitions))
return REGEX_MEMORY_ERROR;
break;
case type_select:
item = stack_top(depth);
if (item->type == type_select) {
SLJIT_ASSERT(compiler_common->dfa_transitions[item->value].type == type_jump);
PUT_TRANSITION(type_branch, item->value + 1);
PUT_TRANSITION(type_jump, item->value);
item->value = transitions_ptr - compiler_common->dfa_transitions;
}
else {
SLJIT_ASSERT(item->type == type_close_br);
item->type = type_select;
PUT_TRANSITION(type_jump, item->value);
item->value = transitions_ptr - compiler_common->dfa_transitions;
}
break;
case type_asterisk:
case type_plus_sign:
case type_qestion_mark:
if (item->type != type_qestion_mark)
PUT_TRANSITION(type_branch, 0);
if (stack_push(depth, item->type, transitions_ptr - compiler_common->dfa_transitions))
return REGEX_MEMORY_ERROR;
break;
case type_char:
case type_newline:
case type_rng_start:
/* Requires handle_iteratives. */
*--transitions_ptr = *item;
transitions_ptr = handle_iteratives(transitions_ptr, compiler_common->dfa_transitions, depth);
break;
default:
*--transitions_ptr = *item;
break;
}
}
SLJIT_ASSERT(compiler_common->dfa_transitions == transitions_ptr);
SLJIT_ASSERT(depth->count == 0);
return REGEX_NO_ERROR;
}
#undef PUT_TRANSITION
#ifdef REGEX_MATCH_VERBOSE
static void verbose_transitions(struct compiler_common *compiler_common)
{
struct stack_item *transitions_ptr = compiler_common->dfa_transitions;
struct stack_item *transitions_end = transitions_ptr + compiler_common->dfa_size;
struct stack_item *search_states_ptr = compiler_common->search_states;
int pos;
printf("-----------------\nTransitions\n-----------------\n");
pos = 0;
while (transitions_ptr < transitions_end) {
printf("[%3d] ", pos++);
if (search_states_ptr->type >= 0)
printf("(%3d) ", search_states_ptr->type);
switch (transitions_ptr->type) {
case type_begin:
printf("type_begin\n");
break;
case type_end:
printf("type_end\n");
break;
case type_char:
if (transitions_ptr->value >= ' ')
printf("type_char '%c'\n", transitions_ptr->value);
else
printf("type_char 0x%x\n", transitions_ptr->value);
break;
case type_newline:
printf("type_newline %s\n", transitions_ptr->value ? "(end)" : "(begin)");
break;
case type_id:
printf("type_id %d\n", transitions_ptr->value);
break;
case type_rng_start:
printf("type_rng_start %s\n", transitions_ptr->value ? "(invert)" : "(normal)");
break;
case type_rng_end:
printf("type_rng_end\n");
break;
case type_rng_char:
if (transitions_ptr->value >= ' ')
printf("type_rng_char '%c'\n", transitions_ptr->value);
else
printf("type_rng_char 0x%x\n", transitions_ptr->value);
break;
case type_rng_left:
if (transitions_ptr->value >= ' ')
printf("type_rng_left '%c'\n", transitions_ptr->value);
else
printf("type_rng_left 0x%x\n", transitions_ptr->value);
break;
case type_rng_right:
if (transitions_ptr->value >= ' ')
printf("type_rng_right '%c'\n", transitions_ptr->value);
else
printf("type_rng_right 0x%x\n", transitions_ptr->value);
break;
case type_branch:
printf("type_branch -> %d\n", transitions_ptr->value);
break;
case type_jump:
printf("type_jump -> %d\n", transitions_ptr->value);
break;
default:
printf("UNEXPECTED TYPE\n");
break;
}
transitions_ptr++;
search_states_ptr++;
}
printf("flags: ");
if (!(compiler_common->flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END | REGEX_NEWLINE | REGEX_ID_CHECK | REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END)))
printf("none ");
if (compiler_common->flags & REGEX_MATCH_BEGIN)
printf("REGEX_MATCH_BEGIN ");
if (compiler_common->flags & REGEX_MATCH_END)
printf("REGEX_MATCH_END ");
if (compiler_common->flags & REGEX_NEWLINE)
printf("REGEX_NEWLINE ");
if (compiler_common->flags & REGEX_ID_CHECK)
printf("REGEX_ID_CHECK ");
if (compiler_common->flags & REGEX_FAKE_MATCH_BEGIN)
printf("REGEX_FAKE_MATCH_BEGIN ");
if (compiler_common->flags & REGEX_FAKE_MATCH_END)
printf("REGEX_FAKE_MATCH_END ");
if (compiler_common->longest_range_size > 0)
printf("(longest range: %ld) ", (long)compiler_common->longest_range_size);
printf("\n");
}
#endif
/* --------------------------------------------------------------------- */
/* Utilities */
/* --------------------------------------------------------------------- */
static int generate_search_states(struct compiler_common *compiler_common)
{
struct stack_item *transitions_ptr = compiler_common->dfa_transitions;
struct stack_item *transitions_end = transitions_ptr + compiler_common->dfa_size;
struct stack_item *search_states_ptr;
struct stack_item *rng_start = NULL;
compiler_common->terms_size = !(compiler_common->flags & REGEX_FAKE_MATCH_END) ? 1 : 2;
compiler_common->longest_range_size = 0;
compiler_common->search_states = SLJIT_MALLOC(sizeof(struct stack_item) * compiler_common->dfa_size, NULL);
if (!compiler_common->search_states)
return REGEX_MEMORY_ERROR;
search_states_ptr = compiler_common->search_states;
while (transitions_ptr < transitions_end) {
switch (transitions_ptr->type) {
case type_begin:
case type_end:
search_states_ptr->type = 0;
break;
case type_char:
search_states_ptr->type = compiler_common->terms_size++;
break;
case type_newline:
if (transitions_ptr->value)
search_states_ptr->type = 1;
else
search_states_ptr->type = compiler_common->terms_size++;
SLJIT_ASSERT(search_states_ptr->type == 1 || search_states_ptr->type == 2);
break;
case type_id:
if (transitions_ptr->value > 0)
compiler_common->flags |= REGEX_ID_CHECK;
search_states_ptr->type = -1;
break;
case type_rng_start:
search_states_ptr->type = compiler_common->terms_size;
rng_start = search_states_ptr;
break;
case type_rng_end:
search_states_ptr->type = compiler_common->terms_size++;
/* Ok, this is a blunt over estimation :) */
if (compiler_common->longest_range_size < search_states_ptr - rng_start)
compiler_common->longest_range_size = search_states_ptr - rng_start;
break;
default:
search_states_ptr->type = -1;
break;
}
search_states_ptr->value = -1;
search_states_ptr++;
transitions_ptr++;
}
return REGEX_NO_ERROR;
}
static int trace_transitions(int from, struct compiler_common *compiler_common)
{
int id = 0;
struct stack *stack = &compiler_common->stack;
struct stack *depth = &compiler_common->depth;
struct stack_item *dfa_transitions = compiler_common->dfa_transitions;
struct stack_item *search_states = compiler_common->search_states;
SLJIT_ASSERT(search_states[from].type >= 0);
from++;
/* Be prepared for any paths (loops, etc). */
while (1) {
if (dfa_transitions[from].type == type_id)
if (id < dfa_transitions[from].value)
id = dfa_transitions[from].value;
if (search_states[from].value < id) {
/* Forward step. */
if (search_states[from].value == -1)
if (stack_push(stack, 0, from))
return REGEX_MEMORY_ERROR;
search_states[from].value = id;
if (dfa_transitions[from].type == type_branch) {
if (stack_push(depth, id, from))
return REGEX_MEMORY_ERROR;
from++;
continue;
}
else if (dfa_transitions[from].type == type_jump) {
from = dfa_transitions[from].value;
continue;
}
else if (search_states[from].type < 0) {
from++;
continue;
}
}
/* Back tracking. */
if (depth->count > 0) {
id = stack_top(depth)->type;
from = dfa_transitions[stack_pop(depth)->value].value;
continue;
}
return 0;
}
}
/* --------------------------------------------------------------------- */
/* Code generator */
/* --------------------------------------------------------------------- */
#define TERM_OFFSET_OF(index, offs) (((index) * no_states + (offs)) * sizeof(sljit_sw))
#define TERM_REL_OFFSET_OF(base, offs) ((base) + ((offs) * sizeof(sljit_sw)))
#define EMIT_OP1(type, arg1, arg2, arg3, arg4) \
CHECK(sljit_emit_op1(compiler, type, arg1, arg2, arg3, arg4))
#define EMIT_OP2(type, arg1, arg2, arg3, arg4, arg5, arg6) \
CHECK(sljit_emit_op2(compiler, type, arg1, arg2, arg3, arg4, arg5, arg6))
#define EMIT_LABEL(label) \
label = sljit_emit_label(compiler); \
CHECK(!label)
#define EMIT_JUMP(jump, type) \
jump = sljit_emit_jump(compiler, type); \
CHECK(!jump)
#define EMIT_CMP(jump, type, arg1, arg2, arg3, arg4) \
jump = sljit_emit_cmp(compiler, type, arg1, arg2, arg3, arg4); \
CHECK(!jump)
/* CHECK depends on the use case. */
#define CHECK(exp) \
if (SLJIT_UNLIKELY(exp)) \
return REGEX_MEMORY_ERROR
static int compile_uncond_tran(struct compiler_common *compiler_common, int reg)
{
struct sljit_compiler *compiler = compiler_common->compiler;
struct stack *stack = &compiler_common->stack;
struct stack_item *search_states = compiler_common->search_states;
int flags = compiler_common->flags;
sljit_sw no_states = compiler_common->no_states;
sljit_uw head = 0;
sljit_sw offset, value;
if (reg != R_CURR_STATE || !(compiler_common->flags & REGEX_FAKE_MATCH_BEGIN)) {
CHECK(trace_transitions(0, compiler_common));
}
else {
CHECK(trace_transitions(1, compiler_common));
}
while (stack->count > 0) {
value = stack_pop(stack)->value;
if (search_states[value].type >= 0) {
offset = TERM_OFFSET_OF(search_states[value].type, 0);
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 1), SLJIT_IMM, head);
if (offset > 0)
head = offset;
if (!(flags & REGEX_MATCH_BEGIN)) {
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 2), R_TEMP, 0);
if (flags & REGEX_ID_CHECK) {
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 3), SLJIT_IMM, search_states[value].value);
}
}
else if (flags & REGEX_ID_CHECK) {
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 2), SLJIT_IMM, search_states[value].value);
}
}
search_states[value].value = -1;
}
if (reg == R_NEXT_STATE) {
EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_NEXT_HEAD, 0);
}
else if (flags & REGEX_FAKE_MATCH_BEGIN) {
SLJIT_ASSERT(compiler_common->dfa_transitions[1].type == type_newline && !compiler_common->dfa_transitions[1].value);
offset = TERM_OFFSET_OF(search_states[1].type, 0);
SLJIT_ASSERT(!(flags & REGEX_MATCH_BEGIN));
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 1), SLJIT_IMM, head);
head = offset;
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 2), SLJIT_IMM, 1);
if (flags & REGEX_ID_CHECK) {
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(reg), TERM_REL_OFFSET_OF(offset, 3), SLJIT_IMM, 0);
}
}
EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, head);
return REGEX_NO_ERROR;
}
static int compile_cond_tran(struct compiler_common *compiler_common, sljit_sw curr_index)
{
struct sljit_compiler *compiler = compiler_common->compiler;
struct stack *stack = &compiler_common->stack;
struct stack_item *search_states = compiler_common->search_states;
sljit_sw offset;
int flags;
sljit_sw no_states;
sljit_sw value;
struct sljit_jump *jump1;
struct sljit_jump *jump2;
struct sljit_jump *jump3;
struct sljit_jump *jump4;
struct sljit_jump *jump5;
struct sljit_label *label1;
flags = compiler_common->flags;
no_states = compiler_common->no_states;
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 0);
if (!(flags & (REGEX_ID_CHECK | REGEX_MATCH_BEGIN))) {
EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 2));
}
while (stack->count > 0) {
value = stack_pop(stack)->value;
if (search_states[value].type >= 0) {
#ifdef REGEX_MATCH_VERBOSE
if (flags & REGEX_MATCH_VERBOSE)
printf("-> (%3d:%3d) ", search_states[value].type, search_states[value].value);
#endif
offset = TERM_OFFSET_OF(search_states[value].type, 0);
if (!(flags & REGEX_ID_CHECK)) {
if (!(flags & REGEX_MATCH_BEGIN)) {
/* Check whether item is inserted. */
EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), SLJIT_IMM, -1);
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), R_NEXT_HEAD, 0);
if (offset > 0) {
EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset);
}
EMIT_JUMP(jump2, SLJIT_JUMP);
/* Check whether old index <= index. */
EMIT_LABEL(label1);
sljit_set_label(jump1, label1);
EMIT_CMP(jump1, SLJIT_LESS_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0);
EMIT_LABEL(label1);
sljit_set_label(jump2, label1);
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0);
EMIT_LABEL(label1);
sljit_set_label(jump1, label1);
}
else {
/* Check whether item is inserted. */
EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), SLJIT_IMM, -1);
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), R_NEXT_HEAD, 0);
if (offset > 0) {
EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset);
}
EMIT_LABEL(label1);
sljit_set_label(jump1, label1);
}
}
else {
if (!(flags & REGEX_MATCH_BEGIN)) {
EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 2));
/* Check whether item is inserted. */
EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), SLJIT_IMM, -1);
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), R_NEXT_HEAD, 0);
if (offset > 0) {
EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset);
}
EMIT_JUMP(jump2, SLJIT_JUMP);
/* Check whether old index != index. */
EMIT_LABEL(label1);
sljit_set_label(jump1, label1);
EMIT_OP2(SLJIT_SUB | SLJIT_SET_Z | SLJIT_SET_LESS, SLJIT_UNUSED, 0, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0);
EMIT_JUMP(jump1, SLJIT_LESS);
EMIT_JUMP(jump3, SLJIT_NOT_EQUAL); /* Greater. */
/* Old index == index. */
EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 3));
if (search_states[value].value > 0) {
EMIT_CMP(jump4, SLJIT_GREATER, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
EMIT_LABEL(label1);
sljit_set_label(jump4, label1);
}
EMIT_OP2(SLJIT_SUB | SLJIT_SET_GREATER_EQUAL, SLJIT_UNUSED, 0, SLJIT_MEM1(R_NEXT_STATE), offset + 3 * sizeof(sljit_sw), R_TEMP, 0);
EMIT_JUMP(jump4, SLJIT_GREATER_EQUAL);
EMIT_JUMP(jump5, SLJIT_JUMP);
/* Overwrite index & id. */
EMIT_LABEL(label1);
sljit_set_label(jump3, label1);
sljit_set_label(jump2, label1);
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0);
EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 3));
if (search_states[value].value > 0) {
EMIT_CMP(jump3, SLJIT_GREATER, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
EMIT_LABEL(label1);
sljit_set_label(jump3, label1);
}
EMIT_LABEL(label1);
sljit_set_label(jump5, label1);
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 3 * sizeof(sljit_sw), R_TEMP, 0);
/* Exit. */
EMIT_LABEL(label1);
sljit_set_label(jump1, label1);
sljit_set_label(jump4, label1);
}
else {
EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(curr_index, 2));
if (search_states[value].value > 0) {
EMIT_CMP(jump1, SLJIT_GREATER, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, search_states[value].value);
EMIT_LABEL(label1);
sljit_set_label(jump1, label1);
}
/* Check whether item is inserted. */
EMIT_CMP(jump1, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), SLJIT_IMM, -1);
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + sizeof(sljit_sw), R_NEXT_HEAD, 0);
if (offset > 0) {
EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, offset);
}
EMIT_JUMP(jump2, SLJIT_JUMP);
/* Check whether old id >= id. */
EMIT_LABEL(label1);
sljit_set_label(jump1, label1);
EMIT_CMP(jump1, SLJIT_GREATER_EQUAL, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0);
EMIT_LABEL(label1);
sljit_set_label(jump2, label1);
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), offset + 2 * sizeof(sljit_sw), R_TEMP, 0);
EMIT_LABEL(label1);
sljit_set_label(jump1, label1);
}
}
}
search_states[value].value = -1;
}
#ifdef REGEX_MATCH_VERBOSE
if (flags & REGEX_MATCH_VERBOSE)
printf("\n");
#endif
return REGEX_NO_ERROR;
}
static int compile_end_check(struct compiler_common *compiler_common, struct sljit_label *end_check_label)
{
struct sljit_compiler *compiler = compiler_common->compiler;
struct sljit_jump *jump;
struct sljit_jump *clear_states_jump;
struct sljit_label *label;
struct sljit_label *leave_label;
struct sljit_label *begin_loop_label;
/* Priority order: best_begin > best_end > best_id.
In other words:
if (new best_begin > old test_begin) do nothing
otherwise we know that new_end > old_end, since R_CURR_INDEX ever increasing
therefore we must overwrite all best_* variables (new_id also contains the highest id for this turn). */
/* Both R_CURR_CHAR and R_BEST_BEGIN used as temporary registers. */
if (!(compiler_common->flags & REGEX_MATCH_BEGIN)) {
EMIT_OP1(SLJIT_MOV, R_CURR_CHAR, 0, SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 2));
EMIT_CMP(jump, !(compiler_common->flags & REGEX_MATCH_NON_GREEDY) ? SLJIT_LESS : SLJIT_LESS_EQUAL, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), R_CURR_CHAR, 0);
sljit_set_label(jump, end_check_label);
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), R_CURR_CHAR, 0);
if (!(compiler_common->flags & (REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END))) {
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0);
}
else {
if ((compiler_common->flags & (REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END)) == (REGEX_FAKE_MATCH_BEGIN | REGEX_FAKE_MATCH_END)) {
EMIT_OP2(SLJIT_SUB, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0, SLJIT_IMM, 2);
}
else {
EMIT_OP2(SLJIT_SUB, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0, SLJIT_IMM, 1);
}
}
if (compiler_common->flags & REGEX_ID_CHECK) {
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 3));
}
EMIT_CMP(clear_states_jump, SLJIT_LESS, R_CURR_CHAR, 0, R_BEST_BEGIN, 0);
EMIT_LABEL(leave_label);
EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, R_CURR_CHAR, 0);
EMIT_JUMP(jump, SLJIT_JUMP);
sljit_set_label(jump, end_check_label);
/* A loop to clear all states, which are > (or >=) than R_CURR_CHAR. */
EMIT_LABEL(label);
sljit_set_label(clear_states_jump, label);
EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_NEXT_HEAD, 0);
EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
/* Begin of the loop. */
EMIT_LABEL(begin_loop_label);
EMIT_CMP(jump, SLJIT_EQUAL, R_TEMP, 0, SLJIT_IMM, 0);
sljit_set_label(jump, leave_label);
EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, R_CURR_STATE, 0);
EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, SLJIT_MEM1(R_TEMP), sizeof(sljit_sw));
EMIT_CMP(clear_states_jump, !(compiler_common->flags & REGEX_MATCH_NON_GREEDY) ? SLJIT_GREATER : SLJIT_GREATER_EQUAL, SLJIT_MEM1(R_TEMP), 2 * sizeof(sljit_sw), R_CURR_CHAR, 0);
/* Case 1: keep this case. */
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_TEMP), sizeof(sljit_sw), R_NEXT_HEAD, 0);
EMIT_OP2(SLJIT_SUB, R_NEXT_HEAD, 0, R_TEMP, 0, R_CURR_STATE, 0);
EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_BEST_BEGIN, 0);
EMIT_JUMP(jump, SLJIT_JUMP);
sljit_set_label(jump, begin_loop_label);
/* Case 2: remove this case. */
EMIT_LABEL(label);
sljit_set_label(clear_states_jump, label);
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_TEMP), sizeof(sljit_sw), SLJIT_IMM, -1);
EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_BEST_BEGIN, 0);
EMIT_JUMP(jump, SLJIT_JUMP);
sljit_set_label(jump, begin_loop_label);
}
else {
EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, SLJIT_IMM, 0);
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, 0);
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0);
if (compiler_common->flags & REGEX_ID_CHECK) {
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 2));
}
EMIT_JUMP(jump, SLJIT_JUMP);
sljit_set_label(jump, end_check_label);
}
return REGEX_NO_ERROR;
}
static int compile_leave_fast_forward(struct compiler_common *compiler_common, struct sljit_label *fast_forward_label)
{
struct sljit_compiler *compiler = compiler_common->compiler;
struct stack *stack = &compiler_common->stack;
struct stack_item *dfa_transitions = compiler_common->dfa_transitions;
struct stack_item *search_states = compiler_common->search_states;
int ind;
struct sljit_jump *jump;
int init_range = 1, prev_value = 0;
while (stack->count > 0) {
ind = stack_pop(stack)->value;
search_states[ind].value = -1;
if (search_states[ind].type >= 0) {
if (dfa_transitions[ind].type == type_char) {
EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, dfa_transitions[ind].value);
sljit_set_label(jump, fast_forward_label);
}
else if (dfa_transitions[ind].type == type_rng_start) {
SLJIT_ASSERT(!dfa_transitions[ind].value);
ind++;
while (dfa_transitions[ind].type != type_rng_end) {
if (dfa_transitions[ind].type == type_rng_char) {
EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, dfa_transitions[ind].value);
sljit_set_label(jump, fast_forward_label);
}
else {
SLJIT_ASSERT(dfa_transitions[ind].type == type_rng_left);
if (init_range) {
EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_CHAR, 0);
init_range = 0;
}
if (dfa_transitions[ind].value != prev_value) {
/* Best compatibility to all archs. */
prev_value -= dfa_transitions[ind].value;
if (prev_value < 0) {
EMIT_OP2(SLJIT_SUB, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, -prev_value);
}
else {
EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, prev_value);
}
prev_value = dfa_transitions[ind].value;
}
EMIT_CMP(jump, SLJIT_LESS_EQUAL, R_TEMP, 0, SLJIT_IMM, dfa_transitions[ind + 1].value - dfa_transitions[ind].value);
sljit_set_label(jump, fast_forward_label);
ind++;
}
ind++;
}
}
else {
SLJIT_ASSERT(dfa_transitions[ind].type == type_newline);
EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\n');
sljit_set_label(jump, fast_forward_label);
EMIT_CMP(jump, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\r');
sljit_set_label(jump, fast_forward_label);
}
}
}
return REGEX_NO_ERROR;
}
static int compile_newline_check(struct compiler_common *compiler_common, sljit_sw ind)
{
struct sljit_compiler *compiler = compiler_common->compiler;
struct sljit_jump *jump1;
struct sljit_jump *jump2;
struct sljit_label *label;
sljit_sw no_states;
sljit_sw offset;
/* Check whether a new-line character is found. */
EMIT_CMP(jump1, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\n');
EMIT_CMP(jump2, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, '\r');
no_states = compiler_common->no_states;
offset = TERM_OFFSET_OF(compiler_common->search_states[ind].type, 1);
EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), offset);
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), offset, SLJIT_IMM, -1);
CHECK(sljit_emit_ijump(compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
EMIT_LABEL(label);
sljit_set_label(jump1, label);
sljit_set_label(jump2, label);
return REGEX_NO_ERROR;
}
#undef CHECK
#define CHECK(exp) \
if (SLJIT_UNLIKELY(exp)) \
return 0
static SLJIT_INLINE void range_set_label(struct sljit_jump **range_jump_list, struct sljit_label *label)
{
while (*range_jump_list) {
sljit_set_label(*range_jump_list, label);
range_jump_list++;
}
}
static sljit_sw compile_range_check(struct compiler_common *compiler_common, sljit_sw ind)
{
struct sljit_compiler *compiler = compiler_common->compiler;
struct stack_item *dfa_transitions = compiler_common->dfa_transitions;
struct sljit_jump **range_jump_list = compiler_common->range_jump_list;
int invert = dfa_transitions[ind].value;
struct sljit_label *label;
sljit_sw no_states;
sljit_sw offset;
int init_range = 1, prev_value = 0;
ind++;
while (dfa_transitions[ind].type != type_rng_end) {
if (dfa_transitions[ind].type == type_rng_char) {
EMIT_CMP(*range_jump_list, SLJIT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, dfa_transitions[ind].value);
range_jump_list++;
}
else {
SLJIT_ASSERT(dfa_transitions[ind].type == type_rng_left);
if (init_range) {
EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_CHAR, 0);
init_range = 0;
}
if (dfa_transitions[ind].value != prev_value) {
/* Best compatibility to all archs. */
prev_value -= dfa_transitions[ind].value;
if (prev_value < 0) {
EMIT_OP2(SLJIT_SUB, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, -prev_value);
}
else {
EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, prev_value);
}
prev_value = dfa_transitions[ind].value;
}
EMIT_CMP(*range_jump_list, SLJIT_LESS_EQUAL, R_TEMP, 0, SLJIT_IMM, dfa_transitions[ind + 1].value - dfa_transitions[ind].value);
range_jump_list++;
ind++;
}
ind++;
}
*range_jump_list = NULL;
if (!invert) {
no_states = compiler_common->no_states;
offset = TERM_OFFSET_OF(compiler_common->search_states[ind].type, 1);
EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), offset);
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), offset, SLJIT_IMM, -1);
CHECK(sljit_emit_ijump(compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
EMIT_LABEL(label);
range_set_label(compiler_common->range_jump_list, label);
/* Clears the jump list. */
*compiler_common->range_jump_list = NULL;
}
return ind;
}
#undef TERM_OFFSET_OF
#undef EMIT_OP1
#undef EMIT_OP2
#undef EMIT_LABEL
#undef EMIT_JUMP
#undef EMIT_CMP
#undef CHECK
/* --------------------------------------------------------------------- */
/* Main compiler */
/* --------------------------------------------------------------------- */
#define TERM_OFFSET_OF(ind, offs) (((ind) * compiler_common.no_states + (offs)) * sizeof(sljit_sw))
#define EMIT_OP1(type, arg1, arg2, arg3, arg4) \
CHECK(sljit_emit_op1(compiler_common.compiler, type, arg1, arg2, arg3, arg4))
#define EMIT_OP2(type, arg1, arg2, arg3, arg4, arg5, arg6) \
CHECK(sljit_emit_op2(compiler_common.compiler, type, arg1, arg2, arg3, arg4, arg5, arg6))
#define EMIT_LABEL(label) \
label = sljit_emit_label(compiler_common.compiler); \
CHECK(!label)
#define EMIT_JUMP(jump, type) \
jump = sljit_emit_jump(compiler_common.compiler, type); \
CHECK(!jump)
#define EMIT_CMP(jump, type, arg1, arg2, arg3, arg4) \
jump = sljit_emit_cmp(compiler_common.compiler, type, arg1, arg2, arg3, arg4); \
CHECK(!jump)
/* A do {} while(0) expression helps to avoid goto statements. */
#define BEGIN_GUARD \
do {
#define END_GUARD \
} while(0);
#define CHECK(exp) \
if (SLJIT_UNLIKELY(exp)) \
break;
struct regex_machine* regex_compile(const regex_char_t *regex_string, int length, int re_flags, int *error)
{
struct compiler_common compiler_common;
sljit_sw ind;
int error_code, done, suggest_fast_forward;
/* ID of an empty match (-1 if not reachable). */
int empty_match_id;
struct sljit_jump *jump;
struct sljit_jump *best_match_found_jump;
struct sljit_jump *fast_forward_jump = NULL;
struct sljit_jump *length_is_zero_jump;
struct sljit_jump *end_check_jump = NULL;
struct sljit_jump *best_match_check_jump = NULL;
struct sljit_jump *non_greedy_end_jump = NULL;
struct sljit_label *label;
struct sljit_label *end_check_label = NULL;
struct sljit_label *start_label;
struct sljit_label *fast_forward_label;
struct sljit_label *fast_forward_return_label;
if (error)
*error = REGEX_NO_ERROR;
#ifdef REGEX_MATCH_VERBOSE
compiler_common.flags = re_flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END | REGEX_MATCH_NON_GREEDY | REGEX_NEWLINE | REGEX_MATCH_VERBOSE);
#else
compiler_common.flags = re_flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END | REGEX_MATCH_NON_GREEDY | REGEX_NEWLINE);
#endif
/* Step 1: parsing (Left->Right).
Syntax check and AST generator. */
error_code = parse(regex_string, length, &compiler_common);
if (error_code) {
stack_destroy(&compiler_common.stack);
if (error)
*error = error_code;
return NULL;
}
/* Step 2: generating branches (Right->Left). */
error_code = generate_transitions(&compiler_common);
stack_destroy(&compiler_common.stack);
stack_destroy(&compiler_common.depth);
if (error_code) {
if (compiler_common.dfa_transitions)
SLJIT_FREE(compiler_common.dfa_transitions, NULL);
if (error)
*error = error_code;
return NULL;
}
/* Step 3: Generate necessary data for depth-first search (Left->Right). */
error_code = generate_search_states(&compiler_common);
if (error_code) {
SLJIT_FREE(compiler_common.dfa_transitions, NULL);
if (error)
*error = error_code;
return NULL;
}
#ifdef REGEX_MATCH_VERBOSE
if (compiler_common.flags & REGEX_MATCH_VERBOSE)
verbose_transitions(&compiler_common);
#endif
/* Step 4: Left->Right generate code. */
stack_init(&compiler_common.stack);
stack_init(&compiler_common.depth);
done = 0;
compiler_common.machine = NULL;
compiler_common.compiler = NULL;
compiler_common.range_jump_list = NULL;
BEGIN_GUARD
compiler_common.machine = (struct regex_machine*)SLJIT_MALLOC(sizeof(struct regex_machine) + (compiler_common.terms_size - 1) * sizeof(sljit_uw), NULL);
CHECK(!compiler_common.machine);
compiler_common.compiler = sljit_create_compiler(NULL);
CHECK(!compiler_common.compiler);
if (compiler_common.longest_range_size > 0) {
compiler_common.range_jump_list = (struct sljit_jump**)SLJIT_MALLOC(sizeof(struct sljit_jump*) * compiler_common.longest_range_size, NULL);
CHECK(!compiler_common.range_jump_list);
}
if ((compiler_common.flags & REGEX_ID_CHECK) && !(compiler_common.flags & REGEX_MATCH_BEGIN))
compiler_common.no_states = 4;
else if (!(compiler_common.flags & REGEX_ID_CHECK) && (compiler_common.flags & REGEX_MATCH_BEGIN))
compiler_common.no_states = 2;
else
compiler_common.no_states = 3;
compiler_common.machine->flags = compiler_common.flags;
compiler_common.machine->no_states = compiler_common.no_states;
compiler_common.machine->size = compiler_common.machine->no_states * compiler_common.terms_size;
/* Study the regular expression. */
empty_match_id = -1;
suggest_fast_forward = 1;
if (!(compiler_common.flags & REGEX_FAKE_MATCH_BEGIN)) {
CHECK(trace_transitions(0, &compiler_common));
while (compiler_common.stack.count > 0) {
ind = stack_pop(&compiler_common.stack)->value;
if (compiler_common.search_states[ind].type == 0) {
SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_end);
suggest_fast_forward = 0;
empty_match_id = compiler_common.search_states[ind].value;
}
else if (compiler_common.search_states[ind].type > 0) {
SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type != type_end);
if (compiler_common.dfa_transitions[ind].type == type_rng_start && compiler_common.dfa_transitions[ind].value)
suggest_fast_forward = 0;
}
compiler_common.search_states[ind].value = -1;
}
}
else {
SLJIT_ASSERT(compiler_common.dfa_transitions[1].type == type_newline);
CHECK(trace_transitions(1, &compiler_common));
while (compiler_common.stack.count > 0) {
ind = stack_pop(&compiler_common.stack)->value;
if (compiler_common.search_states[ind].type == 0) {
SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_end);
suggest_fast_forward = 0;
empty_match_id = compiler_common.search_states[ind].value;
}
compiler_common.search_states[ind].value = -1;
}
}
/* Step 4.1: Generate entry. */
CHECK(sljit_emit_enter(compiler_common.compiler, 0, 3, 5, 5, 0, 0, 0));
/* Copy arguments to their place. */
EMIT_OP1(SLJIT_MOV, R_REGEX_MATCH, 0, SLJIT_S0, 0);
EMIT_OP1(SLJIT_MOV, R_STRING, 0, SLJIT_S1, 0);
EMIT_OP2(SLJIT_ADD, R_LENGTH, 0, SLJIT_S2, 0, SLJIT_IMM, 1);
/* Init global registers. */
EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current));
EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next));
EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head));
EMIT_OP1(SLJIT_MOV, R_BEST_BEGIN, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin));
EMIT_OP1(SLJIT_MOV, R_CURR_INDEX, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, index));
/* Check whether the best match has already found in a previous frame. */
EMIT_CMP(jump, SLJIT_EQUAL, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_quit), SLJIT_IMM, 0);
EMIT_JUMP(best_match_found_jump, SLJIT_JUMP);
#ifdef REGEX_MATCH_VERBOSE
if (compiler_common.flags & REGEX_MATCH_VERBOSE)
printf("\n-----------------\nTrace\n-----------------\n");
#endif
/* Step 4.2: Generate code for state 0. */
EMIT_LABEL(label);
compiler_common.machine->entry_addrs[0] = (sljit_uw)label;
/* Swapping current and next. */
EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_STATE, 0);
EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, R_NEXT_STATE, 0);
EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, R_TEMP, 0);
/* Checking whether the best case needs to be updated. */
if (!(compiler_common.flags & REGEX_MATCH_END)) {
EMIT_CMP(end_check_jump, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 1), SLJIT_IMM, -1);
EMIT_LABEL(end_check_label);
}
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_NEXT_STATE), TERM_REL_OFFSET_OF(0, 1), SLJIT_IMM, -1);
EMIT_OP2(SLJIT_ADD, R_CURR_INDEX, 0, R_CURR_INDEX, 0, SLJIT_IMM, 1);
/* Checking whether best case has already found. */
if (!(compiler_common.flags & REGEX_MATCH_END) || (compiler_common.flags & REGEX_MATCH_BEGIN)) {
if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) {
/* We can bail out if no more active states remain and R_BEST_BEGIN != -1. */
EMIT_CMP(best_match_check_jump, SLJIT_NOT_EQUAL, R_BEST_BEGIN, 0, SLJIT_IMM, -1);
}
else {
/* We can bail out if no more active states remain (regardless of R_BEST_BEGIN). */
EMIT_CMP(best_match_check_jump, SLJIT_EQUAL, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
}
}
EMIT_LABEL(start_label);
sljit_set_label(jump, start_label);
if (!(compiler_common.flags & REGEX_MATCH_BEGIN) && suggest_fast_forward) {
EMIT_CMP(fast_forward_jump, SLJIT_NOT_EQUAL, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 0);
}
/* Loading the next character. */
EMIT_OP2(SLJIT_SUB | SLJIT_SET_Z, R_LENGTH, 0, R_LENGTH, 0, SLJIT_IMM, 1);
EMIT_JUMP(length_is_zero_jump, SLJIT_EQUAL);
EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_STRING, 0);
#ifdef REGEX_USE_8BIT_CHARS
EMIT_OP1(SLJIT_MOV_U8, R_CURR_CHAR, 0, SLJIT_MEM1(R_TEMP), 0);
EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, 1);
#else
EMIT_OP1(SLJIT_MOV_UH, R_CURR_CHAR, 0, SLJIT_MEM1(R_TEMP), 0);
EMIT_OP2(SLJIT_ADD, R_TEMP, 0, R_TEMP, 0, SLJIT_IMM, 2);
#endif
EMIT_OP1(SLJIT_MOV, R_STRING, 0, R_TEMP, 0);
#ifdef REGEX_MATCH_VERBOSE
if (compiler_common.flags & REGEX_MATCH_VERBOSE) {
printf("(%3d): ", 0);
CHECK(trace_transitions(0, &compiler_common));
while (compiler_common.stack.count > 0) {
ind = stack_pop(&compiler_common.stack)->value;
if (compiler_common.search_states[ind].type >= 0)
printf("-> (%3d:%3d) ", compiler_common.search_states[ind].type, compiler_common.search_states[ind].value);
compiler_common.search_states[ind].value = -1;
}
printf("\n");
}
#endif
EMIT_LABEL(fast_forward_return_label);
if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) {
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 1);
if (!(compiler_common.flags & REGEX_MATCH_END)) {
EMIT_CMP(jump, SLJIT_NOT_EQUAL, R_BEST_BEGIN, 0, SLJIT_IMM, -1);
}
EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_CURR_INDEX, 0);
CHECK(compile_uncond_tran(&compiler_common, R_NEXT_STATE));
/* And branching to the first state. */
CHECK(sljit_emit_ijump(compiler_common.compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
if (!(compiler_common.flags & REGEX_MATCH_END)) {
EMIT_LABEL(label);
sljit_set_label(jump, label);
}
}
/* This is the case where we only have to reset the R_NEXT_HEAD. */
EMIT_OP1(SLJIT_MOV, R_TEMP, 0, R_NEXT_HEAD, 0);
EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
CHECK(sljit_emit_ijump(compiler_common.compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
/* Fast-forward loop. */
if (fast_forward_jump) {
/* Quit from fast-forward loop. */
EMIT_LABEL(fast_forward_label);
EMIT_OP2(SLJIT_SUB, R_TEMP, 0, R_NEXT_HEAD, 0, SLJIT_IMM, 1);
EMIT_OP1(SLJIT_MOV, R_LENGTH, 0, R_NEXT_STATE, 0);
EMIT_OP1(SLJIT_MOV, R_STRING, 0, R_CURR_STATE, 0);
EMIT_OP1(SLJIT_MOV, R_CURR_INDEX, 0, R_NEXT_HEAD, 0);
EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next));
EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current));
EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head));
/* Update the start field of the locations. */
CHECK(trace_transitions(0, &compiler_common));
while (compiler_common.stack.count > 0) {
ind = stack_pop(&compiler_common.stack)->value;
if (compiler_common.search_states[ind].type >= 0) {
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(compiler_common.search_states[ind].type, 2), R_TEMP, 0);
}
compiler_common.search_states[ind].value = -1;
}
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_forward), SLJIT_IMM, 0);
EMIT_JUMP(jump, SLJIT_JUMP);
sljit_set_label(jump, fast_forward_return_label);
/* Start fast-forward. */
EMIT_LABEL(label);
sljit_set_label(fast_forward_jump, label);
/* Moving everything to registers. */
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next), R_NEXT_STATE, 0);
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current), R_CURR_STATE, 0);
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head), R_NEXT_HEAD, 0);
EMIT_OP1(SLJIT_MOV, R_NEXT_STATE, 0, R_LENGTH, 0);
EMIT_OP1(SLJIT_MOV, R_CURR_STATE, 0, R_STRING, 0);
EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, R_CURR_INDEX, 0);
/* Fast forward mainloop. */
EMIT_LABEL(label);
EMIT_OP2(SLJIT_SUB | SLJIT_SET_Z, R_NEXT_STATE, 0, R_NEXT_STATE, 0, SLJIT_IMM, 1);
EMIT_JUMP(fast_forward_jump, SLJIT_EQUAL);
#ifdef REGEX_USE_8BIT_CHARS
EMIT_OP1(SLJIT_MOV_U8, R_CURR_CHAR, 0, SLJIT_MEM1(R_CURR_STATE), 0);
EMIT_OP2(SLJIT_ADD, R_CURR_STATE, 0, R_CURR_STATE, 0, SLJIT_IMM, 1);
#else
EMIT_OP1(SLJIT_MOV_UH, R_CURR_CHAR, 0, SLJIT_MEM1(R_CURR_STATE), 0);
EMIT_OP2(SLJIT_ADD, R_CURR_STATE, 0, R_CURR_STATE, 0, SLJIT_IMM, 2);
#endif
CHECK(trace_transitions(0, &compiler_common));
CHECK(compile_leave_fast_forward(&compiler_common, fast_forward_label));
EMIT_OP2(SLJIT_ADD, R_NEXT_HEAD, 0, R_NEXT_HEAD, 0, SLJIT_IMM, 1);
EMIT_JUMP(jump, SLJIT_JUMP);
sljit_set_label(jump, label);
/* String is finished. */
EMIT_LABEL(label);
sljit_set_label(fast_forward_jump, label);
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, index), R_NEXT_HEAD, 0);
EMIT_JUMP(fast_forward_jump, SLJIT_JUMP);
}
/* End check. */
if (end_check_jump) {
EMIT_LABEL(label);
sljit_set_label(end_check_jump, label);
if (!(compiler_common.flags & REGEX_MATCH_NON_GREEDY) || !(compiler_common.flags & REGEX_MATCH_BEGIN)) {
CHECK(compile_end_check(&compiler_common, end_check_label));
}
else {
/* Since we leave, we do not need to update the R_BEST_BEGIN. */
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, 0);
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_end), R_CURR_INDEX, 0);
if (compiler_common.flags & REGEX_ID_CHECK) {
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_MEM1(R_CURR_STATE), TERM_REL_OFFSET_OF(0, 2));
}
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_quit), SLJIT_IMM, 1);
EMIT_JUMP(non_greedy_end_jump, SLJIT_JUMP);
}
}
/* Finish check. */
if (best_match_check_jump) {
EMIT_LABEL(label);
sljit_set_label(best_match_check_jump, label);
if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) {
EMIT_CMP(jump, SLJIT_NOT_EQUAL, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
sljit_set_label(jump, start_label);
}
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, fast_quit), SLJIT_IMM, 1);
}
/* Leaving matching and storing the necessary values. */
EMIT_LABEL(label);
sljit_set_label(length_is_zero_jump, label);
if (non_greedy_end_jump)
sljit_set_label(non_greedy_end_jump, label);
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, index), R_CURR_INDEX, 0);
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, head), R_NEXT_HEAD, 0);
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, next), R_NEXT_STATE, 0);
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_REGEX_MATCH), SLJIT_OFFSETOF(struct regex_match, current), R_CURR_STATE, 0);
/* Exit from JIT. */
EMIT_LABEL(label);
sljit_set_label(best_match_found_jump, label);
if (fast_forward_jump)
sljit_set_label(fast_forward_jump, label);
CHECK(sljit_emit_return(compiler_common.compiler, SLJIT_UNUSED, 0, 0));
for (ind = 1; ind < compiler_common.dfa_size - 1; ind++) {
if (compiler_common.search_states[ind].type >= 0) {
SLJIT_ASSERT(compiler_common.search_states[ind].type < compiler_common.terms_size);
EMIT_LABEL(label);
compiler_common.machine->entry_addrs[compiler_common.search_states[ind].type] = (sljit_uw)label;
if (compiler_common.dfa_transitions[ind].type == type_char) {
EMIT_CMP(jump, SLJIT_NOT_EQUAL, R_CURR_CHAR, 0, SLJIT_IMM, compiler_common.dfa_transitions[ind].value);
}
else if (compiler_common.dfa_transitions[ind].type == type_rng_start) {
ind = compile_range_check(&compiler_common, ind);
CHECK(!ind);
}
else {
SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_newline);
CHECK(compile_newline_check(&compiler_common, ind));
}
CHECK(trace_transitions(ind, &compiler_common));
#ifdef REGEX_MATCH_VERBOSE
if (compiler_common.flags & REGEX_MATCH_VERBOSE)
printf("(%3d): ", compiler_common.search_states[ind].type);
#endif
CHECK(compile_cond_tran(&compiler_common, compiler_common.search_states[ind].type));
if (compiler_common.dfa_transitions[ind].type == type_char) {
EMIT_LABEL(label);
sljit_set_label(jump, label);
}
else if (compiler_common.dfa_transitions[ind].type == type_rng_end) {
EMIT_LABEL(label);
range_set_label(compiler_common.range_jump_list, label);
}
else {
SLJIT_ASSERT(compiler_common.dfa_transitions[ind].type == type_newline);
}
/* Branch to the next item in the list. */
EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(compiler_common.search_states[ind].type, 1));
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(R_CURR_STATE), TERM_OFFSET_OF(compiler_common.search_states[ind].type, 1), SLJIT_IMM, -1);
CHECK(sljit_emit_ijump(compiler_common.compiler, SLJIT_JUMP, SLJIT_MEM2(R_CURR_STATE, R_TEMP), 0));
}
}
if (ind == compiler_common.dfa_size - 1) {
/* Generate an init stub function. */
EMIT_LABEL(label);
CHECK(sljit_emit_enter(compiler_common.compiler, 0, 2, 3, 3, 0, 0, 0));
if (empty_match_id == -1) {
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, -1);
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_IMM, 0);
}
else {
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_begin), SLJIT_IMM, 0);
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, best_id), SLJIT_IMM, empty_match_id);
}
EMIT_OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_S1), SLJIT_OFFSETOF(struct regex_match, index), SLJIT_IMM, !(compiler_common.flags & REGEX_FAKE_MATCH_BEGIN) ? 1 : 2);
if (!(compiler_common.flags & REGEX_MATCH_NON_GREEDY) || empty_match_id == -1) {
/* The else is a really rare event, so we still generate an empty function instead of a runtime pointer check. */
SLJIT_ASSERT(R_CURR_STATE == SLJIT_S0);
if (!(compiler_common.flags & REGEX_MATCH_BEGIN)) {
/* R_CURR_INDEX (put to R_TEMP) is zero. */
EMIT_OP1(SLJIT_MOV, R_TEMP, 0, SLJIT_IMM, 0);
}
CHECK(compile_uncond_tran(&compiler_common, R_CURR_STATE));
}
else {
EMIT_OP1(SLJIT_MOV, R_NEXT_HEAD, 0, SLJIT_IMM, 0);
}
CHECK(sljit_emit_return(compiler_common.compiler, SLJIT_MOV, R_NEXT_HEAD, 0));
compiler_common.machine->continue_match = sljit_generate_code(compiler_common.compiler);
#ifndef SLJIT_INDIRECT_CALL
compiler_common.machine->u.init_match = (void*)(sljit_sw)sljit_get_label_addr(label);
#else
sljit_set_function_context(&compiler_common.machine->u.init_match, &compiler_common.machine->context, sljit_get_label_addr(label), regex_compile);
#endif
#ifdef REGEX_MATCH_VERBOSE
if (compiler_common.flags & REGEX_MATCH_VERBOSE)
printf("Continue match: %p Init match: %p\n\n", compiler_common.machine->continue_match, compiler_common.machine->u.init_match);
#endif
if (compiler_common.machine->continue_match) {
for (ind = 0; ind < compiler_common.terms_size; ++ind)
compiler_common.machine->entry_addrs[ind] = sljit_get_label_addr((struct sljit_label*)compiler_common.machine->entry_addrs[ind]);
done = 1;
}
}
END_GUARD
stack_destroy(&compiler_common.stack);
stack_destroy(&compiler_common.depth);
SLJIT_FREE(compiler_common.dfa_transitions, NULL);
SLJIT_FREE(compiler_common.search_states, NULL);
if (compiler_common.range_jump_list)
SLJIT_FREE(compiler_common.range_jump_list, NULL);
if (compiler_common.compiler)
sljit_free_compiler(compiler_common.compiler);
if (done)
return compiler_common.machine;
if (compiler_common.machine) {
SLJIT_FREE(compiler_common.machine, NULL);
}
if (error)
*error = REGEX_MEMORY_ERROR;
return NULL;
}
#undef TERM_OFFSET_OF
#undef EMIT_OP1
#undef EMIT_OP2
#undef EMIT_LABEL
#undef EMIT_JUMP
#undef EMIT_CMP
#undef BEGIN_GUARD
#undef END_GUARD
#undef CHECK
void regex_free_machine(struct regex_machine *machine)
{
sljit_free_code(machine->continue_match);
SLJIT_FREE(machine, NULL);
}
const char* regex_get_platform_name(void)
{
return sljit_get_platform_name();
}
/* --------------------------------------------------------------------- */
/* Mathching utilities */
/* --------------------------------------------------------------------- */
struct regex_match* regex_begin_match(struct regex_machine *machine)
{
sljit_sw *ptr1;
sljit_sw *ptr2;
sljit_sw *end;
sljit_sw *entry_addrs;
struct regex_match *match = (struct regex_match*)SLJIT_MALLOC(sizeof(struct regex_match) + (machine->size * 2 - 1) * sizeof(sljit_sw), NULL);
if (!match)
return NULL;
ptr1 = match->states;
ptr2 = match->states + machine->size;
end = ptr2;
entry_addrs = (sljit_sw*)machine->entry_addrs;
match->current = ptr1;
match->next = ptr2;
match->head = 0;
match->machine = machine;
/* Init machine states. */
switch (machine->no_states) {
case 2:
while (ptr1 < end) {
*ptr1++ = *entry_addrs;
*ptr2++ = *entry_addrs++;
*ptr1++ = -1;
*ptr2++ = -1;
}
break;
case 3:
while (ptr1 < end) {
*ptr1++ = *entry_addrs;
*ptr2++ = *entry_addrs++;
*ptr1++ = -1;
*ptr2++ = -1;
*ptr1++ = 0;
*ptr2++ = 0;
}
break;
case 4:
while (ptr1 < end) {
*ptr1++ = *entry_addrs;
*ptr2++ = *entry_addrs++;
*ptr1++ = -1;
*ptr2++ = -1;
*ptr1++ = 0;
*ptr2++ = 0;
*ptr1++ = 0;
*ptr2++ = 0;
}
break;
default:
SLJIT_UNREACHABLE();
break;
}
SLJIT_ASSERT(ptr1 == end);
match->u.continue_match = machine->continue_match;
regex_reset_match(match);
return match;
}
void regex_reset_match(struct regex_match *match)
{
struct regex_machine *machine = match->machine;
sljit_sw current, ind;
sljit_sw *current_ptr;
match->best_end = 0;
match->fast_quit = 0;
match->fast_forward = 0;
if (match->head != 0) {
/* Clear the current state. */
current = match->head;
current_ptr = match->current;
do {
ind = (current / sizeof(sljit_sw)) + 1;
current = current_ptr[ind];
current_ptr[ind] = -1;
} while (current != 0);
}
match->head = machine->u.call_init(match->current, match);
}
void regex_free_match(struct regex_match *match)
{
SLJIT_FREE(match, NULL);
}
void regex_continue_match(struct regex_match *match, const regex_char_t *input_string, int length)
{
match->u.call_continue(match, input_string, length);
}
int regex_get_result(struct regex_match *match, int *end, int *id)
{
int flags = match->machine->flags;
sljit_sw no_states;
*end = match->best_end;
*id = match->best_id;
if (!(flags & (REGEX_MATCH_END | REGEX_FAKE_MATCH_END)))
return match->best_begin;
if (flags & REGEX_FAKE_MATCH_END) {
SLJIT_ASSERT(!(flags & (REGEX_MATCH_BEGIN | REGEX_MATCH_END)));
if (match->best_begin != -1)
return match->best_begin;
no_states = match->machine->no_states;
if (match->current[no_states + 1] == -1)
return -1;
if (flags & REGEX_ID_CHECK)
*id = match->current[no_states + 3];
if (!(flags & REGEX_FAKE_MATCH_BEGIN))
*end = match->index - 1;
else
*end = match->index - 2;
return match->current[no_states + 2];
}
else {
/* Check the status of the last code. */
if (!(flags & REGEX_MATCH_BEGIN)) {
/* No shortcut in this case. */
if (!(flags & REGEX_ID_CHECK)) {
if (match->current[1] == -1)
return -1;
*end = match->index - 1;
return match->current[2];
}
if (match->current[1] == -1)
return -1;
*end = match->index - 1;
*id = match->current[3];
return match->current[2];
}
/* Shortcut is possible in this case. */
if (!(flags & REGEX_ID_CHECK)) {
if (match->current[1] == -1 || match->head == -1)
return -1;
*end = match->index - 1;
return 0;
}
if (match->current[1] == -1 || match->head == -1)
return -1;
*end = match->index - 1;
*id = match->current[2];
return 0;
}
}
int regex_is_match_finished(struct regex_match *match)
{
return match->fast_quit;
}
#ifdef REGEX_MATCH_VERBOSE
void regex_continue_match_debug(struct regex_match *match, const regex_char_t *input_string, int length)
{
sljit_sw *ptr;
sljit_sw *end;
sljit_sw count;
#if (defined SLJIT_DEBUG && SLJIT_DEBUG)
sljit_sw current;
#endif
sljit_sw no_states = match->machine->no_states;
sljit_sw len = match->machine->size;
while (length > 0) {
match->u.call_continue(match, input_string, 1);
if (match->fast_forward) {
if (match->machine->flags & REGEX_MATCH_VERBOSE)
printf("fast forward\n");
}
/* Verbose (first). */
if (match->machine->flags & REGEX_MATCH_VERBOSE) {
ptr = match->current;
end = ptr + len;
count = 0;
printf("'%c' (%3ld->%3ld [%3ld]) ", *input_string, (long)match->best_begin, (long)match->best_end, (long)match->best_id);
while (ptr < end) {
printf("[%3ld:", (long)count++);
switch (no_states) {
case 2:
if (ptr[1] != -1)
printf("+] ");
else
printf(" ] ");
break;
case 3:
if (ptr[1] != -1)
printf("+,%3ld] ", (long)ptr[2]);
else
printf(" ,XXX] ");
break;
case 4:
if (ptr[1] != -1)
printf("+,%3ld,%3ld] ", (long)ptr[2], (long)ptr[3]);
else
printf(" ,XXX,XXX] ");
break;
}
ptr += no_states;
}
printf("\n");
}
#if (defined SLJIT_DEBUG && SLJIT_DEBUG)
/* Sanity check (later). */
ptr = match->next;
end = ptr + len;
while (ptr < end) {
SLJIT_ASSERT(ptr[1] == -1);
ptr += no_states;
}
/* Check number of active elements. */
ptr = match->current + no_states;
end = ptr + len - no_states;
count = 0;
while (ptr < end) {
if (ptr[1] != -1)
count++;
ptr += no_states;
}
/* Check chain list. */
current = match->head;
ptr = match->current;
while (current != 0) {
SLJIT_ASSERT(current >= 0 && current < len * sizeof(sljit_sw));
SLJIT_ASSERT((current % (no_states * sizeof(sljit_sw))) == 0);
SLJIT_ASSERT(count > 0);
current = ptr[(current / sizeof(sljit_sw)) + 1];
count--;
}
SLJIT_ASSERT(count == 0);
#endif
if (match->fast_quit) {
/* the machine has stopped working. */
if (match->machine->flags & REGEX_MATCH_VERBOSE)
printf("Best match has found\n");
break;
}
input_string++;
length--;
}
}
#endif