// File: libregexis024/src/libregexis024sol/expr_parse_functions/ep_sequence.cpp
// Listing metadata: 223 lines, 11 KiB, C++; timestamp 2024-07-28 16:54:57 +00:00
#include <libregexis024sol/expr_parse_functions/epf.h>
#include <assert.h>
#include <libregexis024sol/expr_parse_functions/tracking_units.h>
#include <libregexis024sol/sol_misc_base.h>
#include <libregexis024sol/expr_compiler.h>
#include <libregexis024sol/special_terminals.h>
#include <libregexis024vm/vm_opcodes.h>
#include <libregexis024sol/square_bracket_expression.h>
#include <libregexis024sol/expr_parse_functions/command_recognition.h>
#include <libregexis024fa/misc_fa_funcs.h>
// Error-propagation helpers. All four expect a variable named `ctx` to be in scope
// at the point of use, and each one may `return` from the enclosing function.
//
// call_* variants are for functions that return a pointer-like value (they return NULL).
// Bail out if an error has already been recorded in ctx.error.
#define call_ERROR_CHECK do { if (ctx.error) { return NULL; } } while (0)
// Report `str` (a string literal; it is prefixed with "regex: ") via report() and return NULL.
#define call_THROW(str) do { report(ctx, "regex: " str); return NULL; } while (0)
// aux_* variants are for functions returning void.
// Bail out if an error has already been recorded in ctx.error.
#define aux_ERROR_CHECK do { if (ctx.error) { return; } } while (0)
// Report `str` (a string literal; it is prefixed with "regex: ") via report() and return.
#define aux_THROW(str) do { report(ctx, "regex: " str); return; } while (0)
/* **************************** Sequence */
/**
 * Compiles one backslash construct into `backPart`.
 *
 * The current input character must be a backslash; it is consumed here. The
 * character after it (the "leader") selects the construct:
 *   - `\b`: fork between (non-word behind, word ahead) and (word behind, non-word ahead);
 *   - `\B`: fork between (word behind, word ahead) and (non-word behind, non-word ahead);
 *   - `\<`: non-word behind followed by word ahead;
 *   - `\>`: word behind followed by non-word ahead;
 *   - anything else is delegated to backslash_expression_parsing_try_regular()
 *     and compiled as a character-set filter.
 *
 * @param ctx      parsing context; on failure an error is recorded there and the
 *                 function returns early.
 * @param cc       common codesets; only `word_constituents` is used here.
 * @param fa       container in which the automaton nodes are created.
 * @param backPart output: receives the start node and the dangling exits of the
 *                 compiled construct.
 */
void in_case_of_backslash(REGEX_IS024_MeaningContext &ctx, const CommonCodesets& cc, FA_Container &fa, SubExprCompiled& backPart) {
    // The read must happen outside assert(): with NDEBUG defined the whole assert
    // expression is discarded, which would leave the backslash unconsumed.
    const auto backslash = readChar(ctx);
    assert(backslash == U'\\');
    (void)backslash; // silences "unused variable" in NDEBUG builds
    int32_t leader = peep(ctx); aux_ERROR_CHECK;
    if (leader == U'b'){
        // Word boundary: exactly one side of the current position is a word character.
        FA_NodeOfForking* n1 = fa.makeForking();
        FA_NodeOfLookOneBehind* n1a = fa.makeLookOneBehind(invert_set(cc.word_constituents));
        FA_NodeOfLookOneAhead* n2a = fa.makeLookOneAhead(cc.word_constituents);
        reattach_nxt_node(n1a, n2a);
        FA_NodeOfLookOneBehind* n1b = fa.makeLookOneBehind(cc.word_constituents);
        FA_NodeOfLookOneAhead* n2b = fa.makeLookOneAhead(invert_set(cc.word_constituents));
        reattach_nxt_node(n1b, n2b);
        add_option_to_fork_node(n1, n1a);
        add_option_to_fork_node(n1, n1b);
        backPart.start = n1;
        backPart.ends = {&(n2a->nxt_node), &(n2b->nxt_node)};
    } else if (leader == U'B'){
        // Not a word boundary: both sides are word characters, or both are not.
        FA_NodeOfForking* n1 = fa.makeForking();
        FA_NodeOfLookOneBehind* n1a = fa.makeLookOneBehind(cc.word_constituents);
        FA_NodeOfLookOneAhead* n2a = fa.makeLookOneAhead(cc.word_constituents);
        reattach_nxt_node(n1a, n2a);
        FA_NodeOfLookOneBehind* n1b = fa.makeLookOneBehind(invert_set(cc.word_constituents));
        FA_NodeOfLookOneAhead* n2b = fa.makeLookOneAhead(invert_set(cc.word_constituents));
        reattach_nxt_node(n1b, n2b);
        add_option_to_fork_node(n1, n1a);
        add_option_to_fork_node(n1, n1b);
        backPart.start = n1;
        backPart.ends = {&(n2a->nxt_node), &(n2b->nxt_node)};
    } else if (leader == U'<'){
        // Start of word: non-word behind, word ahead.
        FA_NodeOfLookOneBehind *n1 = fa.makeLookOneBehind(invert_set(cc.word_constituents));
        FA_NodeOfLookOneAhead *n2 = fa.makeLookOneAhead(cc.word_constituents);
        reattach_nxt_node(n1, n2);
        backPart.start = n1;
        backPart.ends = {&(n2->nxt_node)};
    } else if (leader == U'>'){
        // End of word: word behind, non-word ahead.
        FA_NodeOfLookOneBehind *n1 = fa.makeLookOneBehind(cc.word_constituents);
        FA_NodeOfLookOneAhead *n2 = fa.makeLookOneAhead(invert_set(cc.word_constituents));
        reattach_nxt_node(n1, n2);
        backPart.start = n1;
        backPart.ends = {&(n2->nxt_node)};
    } else {
        bool ret_is_multicode; codeset_t res_codeset;
        backslash_expression_parsing_try_regular(ctx, cc, ret_is_multicode, res_codeset);
        // Do not build a node out of a codeset produced by a failed parse.
        aux_ERROR_CHECK;
        backPart = subexpr_charset_reading_filter(res_codeset, fa);
        return; // To avoid reading leader again (it gets read in the end)
    }
    // The leader was only peeked so far; consume it now.
    readChar(ctx);
}
/**
 * Validates a repetition range and, if it is acceptable, applies it to `patient`
 * through apply_repeat_to_subexpression().
 *
 * Rejected (an error is reported through `ctx` and nothing is applied):
 *   - min_allowed greater than max_allowed;
 *   - min_allowed greater than REGEXIS024_MAX_REPEAT;
 *   - max_allowed greater than REGEXIS024_MAX_REPEAT while `patient` can match
 *     the empty string.
 */
void repeat_stuff_with_check(REGEX_IS024_MeaningContext& ctx,
                             SubExprCompiled &patient, FA_Container& fa, size_t min_allowed, size_t max_allowed){
    const bool range_is_inverted = max_allowed < min_allowed;
    const bool lower_bound_too_big = REGEXIS024_MAX_REPEAT < min_allowed;
    const bool upper_bound_over_limit = REGEXIS024_MAX_REPEAT < max_allowed;

    if (range_is_inverted) {
        aux_THROW("repeat operation: min > max");
    } else if (lower_bound_too_big) {
        aux_THROW("minimum repeat factor is too high");
    } else if (upper_bound_over_limit && patient.can_be_empty) {
        aux_THROW("safety abortion: possible infinite loop. Если вы считаете, что ваше регулярное "
                  "выражение корректно и не вызвает бесконечного цикла, напишите об этом в жалобную книгу: "
                  "По ссылке: file:///dev/null Ваши предложения по улучшению libregexis024 обязательно будут рассмотрены.");
    }

    apply_repeat_to_subexpression(patient, fa, min_allowed, max_allowed);
}
/**
 * Handles the `!repeat` / `!r` command by applying a repetition to the last
 * element of `parts`.
 *
 * Argument forms, as read from `cmd.arguments`:
 *   - none, or a single empty argument: from 0 up to REGEXIS024_MAX_REPEAT + 1;
 *   - one non-empty argument N: exactly N;
 *   - two arguments: [min, max]; an empty min means 0, an empty max means
 *     REGEXIS024_MAX_REPEAT + 1;
 *   - more than two arguments: error.
 *
 * Errors are reported through `ctx`; the function then returns without
 * touching `parts`.
 */
void repeat_command_processing(REGEX_IS024_MeaningContext &ctx, FA_Container &fa, std::vector<SubExprCompiled>& parts,
                               const Command& cmd){
    if (parts.empty()) {
        aux_THROW("no subexpression before !repeat command");
    }

    const size_t arg_count = cmd.arguments.size();
    if (arg_count > 2) {
        aux_THROW("too many arguments in !repeat command");
    }

    // Defaults correspond to the argument-less form.
    size_t lower = 0;
    size_t upper = REGEXIS024_MAX_REPEAT + 1;

    const bool no_bounds_given = arg_count == 0 || (arg_count == 1 && cmd.arguments[0].is_empty);
    if (no_bounds_given) {
        // Nothing to parse: keep the defaults.
    } else if (arg_count == 1) {
        size_t exact;
        int_parse_with_limit_concern(cmd.arguments[0].name, ctx, exact, REGEXIS024_MAX_REPEAT);
        aux_ERROR_CHECK;
        lower = exact;
        upper = exact;
    } else {
        if (!cmd.arguments[0].is_empty) {
            size_t parsed_min;
            int_parse_with_limit_concern(cmd.arguments[0].name, ctx, parsed_min, REGEXIS024_MAX_REPEAT);
            aux_ERROR_CHECK;
            lower = parsed_min;
        }
        if (!cmd.arguments[1].is_empty) {
            size_t parsed_max;
            int_parse_with_limit_concern(cmd.arguments[1].name, ctx, parsed_max, REGEXIS024_MAX_REPEAT);
            aux_ERROR_CHECK;
            upper = parsed_max;
        }
        if (upper < lower) {
            aux_THROW("!repeat: min > max");
        }
    }

    repeat_stuff_with_check(ctx, parts.back(), fa, lower, upper);
    aux_ERROR_CHECK;
}
/**
 * Parses a sequence of regex elements, appending one compiled sub-expression
 * per element to `parts`.
 *
 * Handled elements: `!command`, backslash constructs, `^`, `$`, the postfix
 * operators `*` `+` `?`, tracking units (`#name(`, `#name:value;`, `#name;`),
 * bracketed groups `(`, square-bracket expressions `[`, and plain characters.
 *
 * The loop stops at a header command (the input position is rewound to just
 * before it), at `)`, `|`, `]`, or when peep() yields a negative value
 * (presumably end of input - confirm against peep()).
 *
 * @return a BracketLvl_ParseCall when a `(` group begins; the input has not been
 *         advanced past the `(` at that point, and parsing resumes through
 *         afterReceive(). Otherwise a null value: either an error was recorded
 *         in `ctx`, or the sequence ended and all of `parts` were joined into
 *         `result`.
 */
chekushka Sequence_ParseCall::firstTime(REGEX_IS024_MeaningContext &ctx, ParsingContext &pctx, FA_Container &fa) {
    while (true) {
        int32_t fst = peep(ctx);
        call_ERROR_CHECK;
        if (fst == U'!') {
            Command cmdBuf;
            size_t before_cmd = ctx.pos;
            cmdBuf = command_expr_parse(ctx);
            call_ERROR_CHECK;
            if (is_header_cmd(cmdBuf)){
                // Header commands are not part of the sequence: rewind and stop here.
                ctx.pos = before_cmd;
                break;
            } else if (cmdBuf.name == "r" || cmdBuf.name == "repeat"){
                repeat_command_processing(ctx, fa, parts, cmdBuf); call_ERROR_CHECK;
            } else if (is_command_for_charset(cmdBuf)){
                codeset_t cs;
                interpret_command_as_charset_giving(pctx.cc, cmdBuf, cs); call_ERROR_CHECK;
                parts.push_back(subexpr_charset_reading_filter(cs, fa));
            } else {
                call_THROW("unknown command");
            }
        } else if (fst == U'\\') {
            parts.emplace_back();
            in_case_of_backslash(ctx, pctx.cc, fa, parts.back());
            call_ERROR_CHECK;
        } else if (fst == U'^'){
            readChar(ctx);
            parts.push_back(subexpression_from_path(fa.makeLookOneBehind(codeset_of_one_char(U'\n'))));
        } else if (fst == U'$'){
            readChar(ctx);
            parts.push_back(subexpression_from_path(fa.makeLookOneAhead(codeset_of_one_char(U'\n'))));
        } else if (fst == U'*'){
// A postfix operator needs an operand; the operator character is consumed only after that check.
#define vibe_check(sn) if (parts.empty()) { call_THROW("no subexpression before `" sn "` operator"); } readChar(ctx);
            vibe_check("*")
            repeat_stuff_with_check(ctx, parts.back(), fa, 0, REGEXIS024_MAX_REPEAT + 1); call_ERROR_CHECK;
        } else if (fst == U'+'){
            vibe_check("+")
            repeat_stuff_with_check(ctx, parts.back(), fa, 1, REGEXIS024_MAX_REPEAT + 1); call_ERROR_CHECK;
        } else if (fst == U'?'){
            vibe_check("?")
            repeat_stuff_with_check(ctx, parts.back(), fa, 0, 1); call_ERROR_CHECK;
#undef vibe_check
        } else if (fst == U'#'){
            readChar(ctx);
            std::string name = tryRead_REGEX024_name(ctx); call_ERROR_CHECK;
            if (name.empty())
                call_THROW("No name provided after #");
            // Register the name on first use; its id is its index in retrieval_info.
            if (ctx.ktr.track_names.count(name) == 0){
                ctx.ktr.track_names[name] = static_cast<int64_t>(ctx.ktr.retrieval_info.size());
                ctx.ktr.retrieval_info.emplace_back();
            }
            int64_t id = ctx.ktr.track_names[name];
            int32_t typeDet = peep(ctx); call_ERROR_CHECK;
            if (typeDet == U'('){
                ensure_space_for_track_unit(ctx, name, tracking_var_types::range); call_ERROR_CHECK;
                parts.emplace_back();
                return std::make_unique<BracketLvl_ParseCall>(parts.back(), id);
            } else if (typeDet == U':'){
                ensure_space_for_track_unit(ctx, name, tracking_var_types::dot_immediate); call_ERROR_CHECK;
                readChar(ctx);
                std::string value_str = tryRead_REGEX024_name(ctx); call_ERROR_CHECK;
                // Initialised and error-checked so that a failed parse can never
                // feed an indeterminate value into the generated instruction.
                size_t value = 0;
                int_parse_with_limit_concern(value_str, ctx, value, UINT16_MAX); call_ERROR_CHECK;
                int32_t cl = peep(ctx); call_ERROR_CHECK;
                if (cl != U';')
                    call_THROW("Missing ; after dot track unit operator");
                readChar(ctx);
                if (ctx.ktr.retrieval_info[id].stored_in_sa)
                    parts.emplace_back(subexpression_from_path(
                        fa.makeTrackArrayMovImm(regex024_opcodes::MOV_SELARR_IMM,
                            ctx.ktr.retrieval_info[id].selarr_first, value)));
                if (ctx.ktr.retrieval_info[id].stored_in_ca)
                    parts.emplace_back(subexpression_from_path(
                        fa.makeTrackArrayMovImm(regex024_opcodes::MOV_COLARR_IMM,
                            ctx.ktr.retrieval_info[id].colarr_first, value)));
            } else if (typeDet == U';'){
                ensure_space_for_track_unit(ctx, name, tracking_var_types::dot_cur_pos); call_ERROR_CHECK;
                readChar(ctx);
                if (ctx.ktr.retrieval_info[id].stored_in_sa)
                    parts.emplace_back(subexpression_from_path(
                        fa.makeTrackArrayMovHalfinvariant(regex024_opcodes::MOV_SELARR_CHPOS,
                            ctx.ktr.retrieval_info[id].selarr_first)));
                if (ctx.ktr.retrieval_info[id].stored_in_ca)
                    parts.emplace_back(subexpression_from_path(
                        fa.makeTrackArrayMovHalfinvariant(regex024_opcodes::MOV_COLARR_BTPOS,
                            ctx.ktr.retrieval_info[id].colarr_first)));
            } else
                call_THROW("Missing ; or ( in the beginning of tracking unit");
        } else if (fst == U'(') {
            // Untracked group: -1 is passed where tracked groups pass their id.
            parts.emplace_back();
            return std::make_unique<BracketLvl_ParseCall>(parts.back(), -1);
        } else if (fst == U'[') {
            codeset_t filter = sq_bracket_expr_parse(ctx, pctx.cc); call_ERROR_CHECK;
            parts.push_back(subexpr_charset_reading_filter(filter, fa));
        } else if (fst >= 0 && fst != U')' && fst != U'|' && fst != U']'){
            // Any other character matches itself.
            readChar(ctx);
            parts.push_back(subexpr_charset_reading_filter(codeset_of_one_char(fst), fa));
        } else {
            break;
        }
    }
    // End of the sequence: chain all collected parts, in order, into the result.
    for (SubExprCompiled& part: parts)
        result = join(result, part);
    return nullptr;
}
/**
 * Resumes sequence parsing after a nested parse call has finished.
 *
 * firstTime() only hands control to a nested call when it meets a bracketed
 * group, so resuming simply re-enters the same loop via firstTime().
 */
chekushka Sequence_ParseCall::afterReceive(REGEX_IS024_MeaningContext &ctx, ParsingContext &pctx, FA_Container &fa) {
    return this->firstTime(ctx, pctx, fa);
}