First version
This commit is contained in:
commit
b4381e9238
11
.gitignore
vendored
Normal file
11
.gitignore
vendored
Normal file
@ -0,0 +1,11 @@
|
||||
# Never use CMAKE in production
|
||||
CMakeLists.txt
|
||||
cmake-build-debug/
|
||||
# Output of build system
|
||||
built/
|
||||
# This is a compiled build system script
|
||||
building/main
|
||||
building/*.png
|
||||
building/*.svg
|
||||
|
||||
.idea/
|
8
README.txt
Normal file
8
README.txt
Normal file
@ -0,0 +1,8 @@
|
||||
libregexis024
|
||||
Library for Regular Expressions, implementation of summer 2024
|
||||
|
||||
libregexis024vm
|
||||
Provides only means of configuration and running my regexp virtual machine bytecode
|
||||
|
||||
libregexis024sol
|
||||
Provides functions to compile regular expressions into libregexis024 virtual machine bytecode
|
9
building/build_build_system.sh
Executable file
9
building/build_build_system.sh
Executable file
@ -0,0 +1,9 @@
|
||||
#!/bin/sh
# Compiles building/main.cpp into the executable building/main.
# Paths are relative, so the script has to be started from the directory
# that contains "building/". Exits with status 1 on any failure.

BUILDING_DIR="./building"
MAIN_FILE="$BUILDING_DIR/main.cpp"

# Prints a diagnostic to stderr and terminates with status 1.
die() {
    echo "build_build_system.sh: $1" >&2
    exit 1
}

[ -d "$BUILDING_DIR" ] || die "directory $BUILDING_DIR not found"
[ -f "$MAIN_FILE" ] || die "source file $MAIN_FILE not found"

# Compiler flags come from the pkg-config module regexis024-build-system.
# Fail here with a clear message rather than later with a compiler error.
COOL_FLAGS="$(pkg-config --cflags regexis024-build-system)" \
    || die "pkg-config could not resolve regexis024-build-system"

# $COOL_FLAGS is deliberately unquoted: it must split into separate arguments.
g++ $COOL_FLAGS -o "$BUILDING_DIR/main" "$MAIN_FILE" || die "compilation of $MAIN_FILE failed"
|
156
building/main.cpp
Normal file
156
building/main.cpp
Normal file
@ -0,0 +1,156 @@
|
||||
#include <regexis024_build_system.h>
|
||||
|
||||
/*
|
||||
* LIBREGEXIS024 SPECIFIC BUILD COMMANDS BEGIN
|
||||
*/
|
||||
|
||||
struct Libregexis024BuildSystem {
|
||||
/* Building runlevel */
|
||||
BuildUnitsArray runlevel_1;
|
||||
/* Installation runlevel */
|
||||
BuildUnitsArray runlevel_2;
|
||||
|
||||
/* "debug" or "release" */
|
||||
std::string build_type;
|
||||
bool build_tests = false;
|
||||
|
||||
std::vector<std::string> warning_flags = {"-Wall", "-Wno-unused-variable", "-Werror=return-type","-pedantic",
|
||||
"-Wno-unused-but-set-variable", "-Wno-reorder"};
|
||||
std::vector<std::string> version_flags = {"--std", "c++14", "-D", "_POSIX_C_SOURCE=200809L"};
|
||||
|
||||
std::vector<std::string> debug_defines_release = {"_GLIBCXX_DEBUG"};
|
||||
std::vector<std::string> debug_defines_debug = {"_GLIBCXX_DEBUG", "LIBREGEXIS024_DEBUG", "LIBREGEXIS024_ALLOW_LOUD"};
|
||||
std::vector<std::string> opt_flags_release = {"-g", "-O2"};
|
||||
std::vector<std::string> opt_flags_debug = {"-g", "-ggdb", "-O0"};
|
||||
|
||||
std::vector<std::string> getSomeRadFlags() {
|
||||
std::vector<std::string> my_flag_collection;
|
||||
gxx_add_cli_options(my_flag_collection, warning_flags);
|
||||
gxx_add_cli_options(my_flag_collection, version_flags);
|
||||
if (build_type == "release") {
|
||||
gxx_add_cli_defines(my_flag_collection, debug_defines_release);
|
||||
gxx_add_cli_options(my_flag_collection, opt_flags_release);
|
||||
} else if (build_type == "debug") {
|
||||
gxx_add_cli_defines(my_flag_collection, debug_defines_debug);
|
||||
gxx_add_cli_options(my_flag_collection, opt_flags_debug);
|
||||
}
|
||||
return my_flag_collection;
|
||||
}
|
||||
|
||||
Libregexis024BuildSystem(const std::string& build_type, const NormalCBuildSystemCommandMeaning& cmd)
|
||||
:build_type(build_type)
|
||||
{
|
||||
ASSERT(build_type == "release" || build_type == "debug", "Unknown build type");
|
||||
|
||||
std::vector<ExternalLibraryTarget> ext_targets;
|
||||
|
||||
std::vector<CTarget> my_targets;
|
||||
{
|
||||
std::vector<std::string> compilation_units_release = {
|
||||
"libregexis024vm/utils.cpp",
|
||||
"libregexis024vm/vm_errno.cpp",
|
||||
"libregexis024vm/vm_opcodes_disassembly.cpp",
|
||||
"libregexis024vm/libregexis024vm_interface.cpp",
|
||||
"libregexis024vm/libregexis024vm_disassembly.cpp",
|
||||
"libregexis024vm/libregexis024vm_context.cpp",
|
||||
"libregexis024vm/instruction_implementation.cpp",
|
||||
"libregexis024vm/libregex024opcodes_stringification.cpp",
|
||||
|
||||
"libregexis024fa/codeset.cpp",
|
||||
"libregexis024fa/colored_codeset.cpp",
|
||||
"libregexis024fa/fa_first_stage_fix.cpp",
|
||||
"libregexis024fa/finite_automaton.cpp",
|
||||
"libregexis024fa/misc_fa_funcs.cpp",
|
||||
"libregexis024fa/selarr_priority_table.cpp",
|
||||
"libregexis024fa/tracking_fa_nodes.cpp",
|
||||
"libregexis024fa/fa_make_deterministic.cpp",
|
||||
|
||||
"libregexis024fa/graph_to_bytecode/natural_compiler_utils.cpp",
|
||||
"libregexis024fa/graph_to_bytecode/writing_commands.cpp",
|
||||
"libregexis024fa/graph_to_bytecode/filter.cpp",
|
||||
"libregexis024fa/graph_to_bytecode/fa_compiler.cpp",
|
||||
"libregexis024fa/graph_to_bytecode/core.cpp",
|
||||
|
||||
"libregexis024sol/common_codesets.cpp",
|
||||
"libregexis024sol/part_of_expr_that_tracks.cpp",
|
||||
"libregexis024sol/expr_compiler.cpp",
|
||||
"libregexis024sol/square_bracket_expression.cpp",
|
||||
"libregexis024sol/sol_misc_base.cpp",
|
||||
"libregexis024sol/command_expression.cpp",
|
||||
"libregexis024sol/backslash_expression.cpp",
|
||||
"libregexis024sol/subexpr_fa_transformed.cpp",
|
||||
"libregexis024sol/expr_parse_functions/tracking_units.cpp",
|
||||
"libregexis024sol/expr_parse_functions/ep_sequence.cpp",
|
||||
"libregexis024sol/expr_parse_functions/command_recognition.cpp",
|
||||
|
||||
"libregexis024tools/stringmatching.cpp",
|
||||
};
|
||||
|
||||
/* These are added to compilation_units_of_release */
|
||||
std::vector<std::string> additional_compilation_units_debug = {
|
||||
"debugging_regexis024/prettyprint/prettyprint_util.cpp",
|
||||
"debugging_regexis024/vm/libregexis024vm_debug.cpp",
|
||||
"debugging_regexis024/debug_through_graphviz.cpp",
|
||||
};
|
||||
|
||||
/* Suitable forr both release and debug (even though you will pretty much never need to export headers of build of
|
||||
* debug build type */
|
||||
std::vector<std::string> exported_headers = {
|
||||
"libregexis024vm/vm_errno.h",
|
||||
"libregexis024vm/vm_opcodes_types.h",
|
||||
"libregexis024vm/vm_opcodes.h",
|
||||
"libregexis024vm/libregexis024vm_interface.h",
|
||||
|
||||
"libregexis024fa/tracking_variables.h",
|
||||
|
||||
"libregexis024sol/part_of_expr_that_tracks.h",
|
||||
"libregexis024sol/expr_compiler.h",
|
||||
|
||||
"libregexis024tools/stringmatching.h",
|
||||
};
|
||||
|
||||
CTarget T("libregexis024", "shared_library");
|
||||
T.additional_compilation_flags = getSomeRadFlags();
|
||||
array_concat(T.units, compilation_units_release);
|
||||
if (build_type == "debug")
|
||||
array_concat(T.units, additional_compilation_units_debug);
|
||||
T.include_pr = "";
|
||||
T.include_ir = "";
|
||||
T.exported_headers = exported_headers;
|
||||
T.installation_dir = "";
|
||||
T.pc_output_path = "libregexis024.pc";
|
||||
my_targets.push_back(T);
|
||||
}
|
||||
if (build_tests) {
|
||||
CTarget T("libregexis024_test4", "executable");
|
||||
T.additional_compilation_flags = getSomeRadFlags();
|
||||
T.proj_deps = {CTargetDependenceOnProjectsLibrary("libregexis024")};
|
||||
T.units = {"libregexis024test/test4.cpp"};
|
||||
my_targets.push_back(T);
|
||||
}
|
||||
|
||||
regular_ctargets_to_2bus_conversion(ext_targets, my_targets, runlevel_1, runlevel_2,
|
||||
cmd.project_root, cmd.installation_root);
|
||||
}
|
||||
};
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
try {
|
||||
assert(argc > 0);
|
||||
std::vector<std::string> args(argc - 1);
|
||||
for (int i = 0; i + 1 < argc; i++) {
|
||||
args[i] = argv[i + 1];
|
||||
}
|
||||
NormalCBuildSystemCommandMeaning cmd;
|
||||
regular_bs_cli_cmd_interpret(args, cmd);
|
||||
Libregexis024BuildSystem bs("debug", cmd);
|
||||
show_build_units_array_with_image_viewer(bs.runlevel_1, "true");
|
||||
show_build_units_array_with_image_viewer(bs.runlevel_2, "true");
|
||||
if (cmd.need_to_build)
|
||||
complete_tasks_of_build_units(bs.runlevel_1);
|
||||
if (cmd.need_to_install)
|
||||
complete_tasks_of_build_units(bs.runlevel_2);
|
||||
} catch (const buildSystemFailure& e) {
|
||||
printf("Build system failure\n""%s\n", e.toString().c_str());
|
||||
}
|
||||
}
|
325
src/debugging_regexis024/debug_through_graphviz.cpp
Normal file
325
src/debugging_regexis024/debug_through_graphviz.cpp
Normal file
@ -0,0 +1,325 @@
|
||||
#include <debugging_regexis024/debug_through_graphviz.h>
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
#include <wait.h>
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include <libregexis024vm/vm_opcodes.h>
|
||||
#include <libregexis024fa/tracking_fa_nodes.h>
|
||||
|
||||
/* Graphviz color names, one per finite automaton node type (named <node type>_color),
 * plus the color used for reporting errors.
 * They are file-local constants: `static` keeps these generic names out of the global
 * namespace of the program and the const pointers cannot be re-seated by accident. */
static const char* const one_char_read_color = "black";
static const char* const forking_color = "darkorchid1";
static const char* const look_one_behind_color = "darkslateblue";
static const char* const look_one_ahead_color = "coral1";
static const char* const track_array_mov_imm_color = "lightblue2";
static const char* const track_array_mov_halfinvariant_color = "lightseagreen";
static const char* const match_pending_lob_color = "darkgoldenrod2";
static const char* const match_color = "gold";
static const char* const det_char_crossroads_color = "navy";
static const char* const error_color = "crimson";
/* Marker appended to labels of nodes/edges that have second_ns set */
static const char* const STAR = "★";
|
||||
|
||||
/* Returns the Graphviz color name associated with the type of the given node.
 * A match node is colored differently when ext_filter_added is set on it.
 * Node types without a dedicated color yield "black". */
const char* get_associated_color(FA_Node* node){
    const auto kind = node->type;
    if (kind == one_char_read)
        return one_char_read_color;
    if (kind == forking)
        return forking_color;
    if (kind == look_one_behind)
        return look_one_behind_color;
    if (kind == look_one_ahead)
        return look_one_ahead_color;
    if (kind == track_array_mov_imm)
        return track_array_mov_imm_color;
    if (kind == track_array_mov_halfinvariant)
        return track_array_mov_halfinvariant_color;
    if (kind == det_char_crossroads)
        return det_char_crossroads_color;
    if (kind == match) {
        FA_NodeOfMatch* as_match = dynamic_cast<FA_NodeOfMatch*>(node);
        return as_match->ext_filter_added ? match_pending_lob_color : match_color;
    }
    return "black";
}
|
||||
|
||||
/* Per-node diagnostics gathered while the automaton is being printed. */
struct NodesProblems{
    /* Number of references to the node that were actually found */
    size_t actual_refcount{0};
    /* Set when the counter stored in the node disagrees with actual_refcount */
    bool refcount_problem{false};
    /* How many NULL transitions of this node have been printed so far */
    size_t edges_point_to_null{0};
};
|
||||
|
||||
/* Diagnostics attached to a single edge of the printed automaton. */
struct EdgesProblems {
    /* True when the edge has no destination node */
    bool points_to_null{false};

    explicit EdgesProblems(bool points_to_null)
        : points_to_null{points_to_null}
    {}
};
|
||||
|
||||
/* Builds the Graphviz attribute list for an edge leaving `node`.
 * An edge that points to NULL is drawn in the error color; any other edge takes the color
 * of its source node and is drawn bold when the source is a one_char_read or
 * det_char_crossroads node. `np` is currently not consulted. */
std::string get_applied_edge_attributes(FA_Node* node, const NodesProblems& np, const EdgesProblems& ep){
    if (ep.points_to_null)
        return std::string("color=") + error_color;

    std::string attributes = std::string("color=") + get_associated_color(node);
    const bool source_reads_char = node->type == one_char_read || node->type == det_char_crossroads;
    if (source_reads_char)
        attributes += " style=bold";
    return attributes;
}
|
||||
|
||||
/* Builds the Graphviz attribute list for a node: its type color, a crimson font when
 * a refcount problem was detected, and a double circle for match nodes and for
 * det_char_crossroads nodes whose `matching` member is set. */
std::string get_applied_node_attributes(FA_Node* node, const NodesProblems& bd){
    std::string attributes("color=");
    attributes.append(get_associated_color(node));
    if (bd.refcount_problem)
        attributes.append(" fontcolor=crimson");

    bool accepting = (node->type == match);
    if (!accepting && node->type == det_char_crossroads)
        accepting = dynamic_cast<FA_NodeOfDetCharCrossroads*>(node)->matching;
    if (accepting)
        attributes.append(" shape=doublecircle");
    return attributes;
}
|
||||
|
||||
/* Appends the lowercase hexadecimal digits of num to res, least significant digit first
 * (i.e. the number is written backwards). Zero is written as a single "0". */
void append_reverse_hex(std::string& res, uint32_t num){
    static const char hex_digits[] = "0123456789abcdef";
    do {
        res += hex_digits[num & 0x0F];
        num >>= 4;
    } while (num != 0);
}
|
||||
|
||||
std::string stringify_codeset(const codeset_t& cs){
|
||||
std::string res;
|
||||
for (long i = static_cast<long>(cs.size()) - 1; i >= 0; i--) {
|
||||
uint64_t start = cs[i].first, end = cs[i].second;
|
||||
if (start == end) {
|
||||
append_reverse_hex(res, start);
|
||||
} else {
|
||||
append_reverse_hex(res, end);
|
||||
res += '-';
|
||||
append_reverse_hex(res, start);
|
||||
}
|
||||
if (i != 0)
|
||||
res += ',';
|
||||
}
|
||||
std::reverse(res.begin(), res.end()); /* ascii works wonders */
|
||||
return res;
|
||||
}
|
||||
|
||||
/* Returns the optional suffix of a node label:
 *  - " " followed by STAR for one_char_read / det_char_crossroads nodes with second_ns set,
 *  - " pending loa <codeset>" for match nodes with ext_filter_added set,
 *  - an empty string otherwise. */
std::string get_extended_node_lable(FA_Node* node){
    switch (node->type) {
        case one_char_read:
            if (dynamic_cast<FA_NodeOfOneCharRead*>(node)->second_ns)
                return std::string(" ") + STAR;
            break;
        case det_char_crossroads:
            if (dynamic_cast<FA_NodeOfDetCharCrossroads*>(node)->second_ns)
                return std::string(" ") + STAR;
            break;
        case match: {
            FA_NodeOfMatch* match_node = static_cast<FA_NodeOfMatch*>(node);
            if (match_node->ext_filter_added)
                return std::string(" pending loa ") + stringify_codeset(match_node->pending_filter);
            break;
        }
        default:
            break;
    }
    return "";
}
|
||||
|
||||
/* Composes the label of a node: a short abbreviation of its type, the node id in square
 * brackets, the suffix produced by get_extended_node_lable and, when a refcount problem
 * was detected, the refcount stored in the node. */
std::string get_node_lable(FA_Node* node, const NodesProblems& bd){
    const char* abbreviation = "";
    switch (node->type) {
        case one_char_read:                 abbreviation = "ocr";  break;
        case match:                         abbreviation = "m";    break;
        case forking:                       abbreviation = "f";    break;
        case look_one_behind:               abbreviation = "lob";  break;
        case look_one_ahead:                abbreviation = "loa";  break;
        case track_array_mov_imm:           abbreviation = "tami"; break;
        case track_array_mov_halfinvariant: abbreviation = "tamh"; break;
        case det_char_crossroads:           abbreviation = "dcc";  break;
        default:                            break;
    }
    std::string label = abbreviation;
    label += "[";
    label += std::to_string(node->nodeId);
    label += "]";
    label += get_extended_node_lable(node);
    if (bd.refcount_problem) {
        label += "!refcount: ";
        label += std::to_string(node->refs);
        label += "!";
    }
    return label;
}
|
||||
|
||||
/* Writes one edge statement of the graph to fd.
 * start - source node of the edge
 * dest  - destination node; NULL denotes a broken transition
 * label - text shown next to the edge
 * np    - diagnostics of the source node; edges_point_to_null is incremented for every
 *         NULL transition so that each one gets its own placeholder node NULL_<id>_<k>
 * A NULL transition is additionally reported on stderr. Every statement ends with a
 * newline, and both kinds of edges take their attributes from get_applied_edge_attributes. */
void print_edge(FA_Node* start, const FA_Node* dest, const std::string& label, FILE* fd, NodesProblems& np){
    if (!dest){
        fprintf(stderr, "NULL transition going from node %lu\n", start->nodeId);
        /* Taken before the call below so that the increment is not mixed with other uses of np */
        const size_t null_index = np.edges_point_to_null++;
        fprintf(fd, "%lu->NULL_%lu_%lu [label=\"%s\" %s]\n", start->nodeId,
            start->nodeId, null_index, label.c_str(),
            get_applied_edge_attributes(start, np, EdgesProblems(true)).c_str());
        return;
    }
    fprintf(fd, "%lu->%lu [label=\"%s\" %s]\n", start->nodeId, dest->nodeId, label.c_str(),
        get_applied_edge_attributes(start, np, EdgesProblems(false)).c_str());
}
|
||||
|
||||
void print_fa(const FA_Container& fa, FILE* fd, const KnownTrackingTools& ktr,
|
||||
const RegexPriorityTable& priority_table){
|
||||
assert(fa.start);
|
||||
assert(fd);
|
||||
fprintf(fd, "digraph finite_automaton {\ngraph ["
|
||||
"fontname = \"Helvetica\" charset = \"UTF-8\" label = \"Finite Automaton\" labelloc = \"t\" labeljust = \"c\" "
|
||||
"bgcolor = \"#FFFAF4\" fontcolor = black fontsize = 18 style = \"filled\" rankdir = LR margin = 0.2 "
|
||||
"splines = spline nodesep = 0.9 ranksep = 1.2 ]\n node [ style = \"solid,filled\" fontsize = 15 "
|
||||
"fontcolor = black fontname = \"Helvetica\" color = black fillcolor = white margin = \"0.2,0.2\" shape=circle "
|
||||
"]\n edge [ style = solid fontsize = 16 fontcolor = black fontname = \"Helvetica\" color = black "
|
||||
"labelfloat = false labeldistance = 2.5 labelangle = 70 arrowhead = normal ]\n"
|
||||
"start_state [label = \"start\\nfrom\\nhere\" shape=none style=\"\" ]\n");
|
||||
|
||||
size_t n = fa.all.size();
|
||||
std::vector<NodesProblems> breakdown;
|
||||
breakdown.resize(n);
|
||||
breakdown[fa.start->nodeId].actual_refcount++;
|
||||
for (size_t i = 0; i < n; i++){
|
||||
assert(fa.all[i]->nodeId == static_cast<int64_t>(i));
|
||||
for (FA_Node** nxtN: fa.all[i]->get_all_transitions())
|
||||
if ((*nxtN) != NULL)
|
||||
breakdown[(**nxtN).nodeId].actual_refcount++;
|
||||
}
|
||||
for (size_t i = 0; i < n; i++){
|
||||
if (fa.all[i]->refs != breakdown[i].actual_refcount){
|
||||
breakdown[i].refcount_problem = true;
|
||||
fprintf(stderr, "Corrupted FA: wrong refcount on node %lu\n", fa.all[i]->nodeId);
|
||||
}
|
||||
}
|
||||
for (size_t i = 0; i < n; i++){
|
||||
fprintf(fd, "%lu [label=\"%s\" %s]\n", i, get_node_lable(fa.all[i], breakdown[i]).c_str(),
|
||||
get_applied_node_attributes(fa.all[i], breakdown[i]).c_str());
|
||||
}
|
||||
|
||||
/* Two Infoboxes */
|
||||
|
||||
auto stringifyTrackingVarType = [](tracking_var_type type) -> std::string {
|
||||
switch (type) {
|
||||
case tracking_var_types::range:
|
||||
return "range";
|
||||
case tracking_var_types::dot_cur_pos:
|
||||
return "dot of cur pos";
|
||||
default:
|
||||
return "dot of immediate";
|
||||
}
|
||||
};
|
||||
|
||||
std::string infoText;
|
||||
for (auto& p: ktr.track_names){
|
||||
const SubtrackingNameInfo& tu = ktr.retrieval_info[p.second];
|
||||
|
||||
auto getRole = [](bool presence, tracking_var_type type, int first, int second,
|
||||
const std::string& ARR_NAME) -> std::string {
|
||||
if (!presence) {
|
||||
assert(first == -1 && second == -1);
|
||||
return "Not involved in " + ARR_NAME;
|
||||
}
|
||||
if (type == tracking_var_types::range){
|
||||
assert(first != -1 && second != -1);
|
||||
return "In " + ARR_NAME + ": " + std::to_string(first) + " <−> " + std::to_string(second);
|
||||
}
|
||||
assert(first != -1 && second == -1);
|
||||
return "In " + ARR_NAME + ": ( " + std::to_string(first) + " )";
|
||||
};
|
||||
char buf[2048] = {0};
|
||||
snprintf(buf, 2048, "Tracking unit name: %s\\n" "Discovered: %s\\n" "Type: %s\\n" "%s\\n%s",
|
||||
p.first.c_str(), tu.discovered ? "ofcourse" : "no",
|
||||
stringifyTrackingVarType(tu.type).c_str(),
|
||||
getRole(tu.stored_in_ca, tu.type, tu.colarr_first, tu.colarr_second, "colarr").c_str(),
|
||||
getRole(tu.stored_in_sa, tu.type, tu.selarr_first, tu.selarr_second, "selarr").c_str());
|
||||
if (!infoText.empty())
|
||||
infoText += "|";
|
||||
infoText += buf;
|
||||
}
|
||||
fprintf(fd, "infoBoard1 [label=\"%s\" shape = record]\n", infoText.c_str());
|
||||
infoText = "";
|
||||
for (size_t i = 0; i < priority_table.size(); i++){
|
||||
const RegexPriorityTableAction& tu = priority_table[i];
|
||||
if (!infoText.empty())
|
||||
infoText += "|";
|
||||
infoText += tu.minimize ? "Minimize " : "Maximize ";
|
||||
if (tu.pos.isForRange()){
|
||||
infoText += "[" + std::to_string(tu.pos.second) + "] - [" + std::to_string(tu.pos.first) + "]";
|
||||
} else {
|
||||
infoText += "[" + std::to_string(tu.pos.first) + "]";
|
||||
}
|
||||
}
|
||||
fprintf(fd, "infoBoard2 [label=\"%s\" shape = record]\n", infoText.c_str());
|
||||
|
||||
assert(fa.start);
|
||||
fprintf(fd, "start_state->%lu [color=gray style=dotted]\n", fa.start->nodeId);
|
||||
|
||||
|
||||
for (FA_Node* node: fa.all){
|
||||
NodesProblems& bd = breakdown[node->nodeId];
|
||||
if (node->type == one_char_read){
|
||||
FA_NodeOfOneCharRead* cn = dynamic_cast<FA_NodeOfOneCharRead *>(node);
|
||||
std::string str = stringify_codeset(cn->filter);
|
||||
print_edge(node, cn->nxt_node, str + (cn->second_ns ? std::string(" ") + STAR : ""), fd, bd);
|
||||
} else if (node->type == forking){
|
||||
FA_NodeOfForking* cn = dynamic_cast<FA_NodeOfForking *>(node);
|
||||
for (FA_Node* nxt: cn->nxt_options){
|
||||
print_edge(node, nxt, "", fd, bd);
|
||||
}
|
||||
} else if (node->type == look_one_behind){
|
||||
FA_NodeOfLookOneBehind* cn = dynamic_cast<FA_NodeOfLookOneBehind *>(node);
|
||||
print_edge(node, cn->nxt_node, stringify_codeset(cn->filter), fd, bd);
|
||||
} else if (node->type == look_one_ahead){
|
||||
FA_NodeOfLookOneAhead* cn = dynamic_cast<FA_NodeOfLookOneAhead *>(node);
|
||||
print_edge(node, cn->nxt_node, stringify_codeset(cn->restriction), fd, bd);
|
||||
} else if (node->type == track_array_mov_imm){
|
||||
FA_NodeOfTrackArrayMovImm* cn = dynamic_cast<FA_NodeOfTrackArrayMovImm *>(node);
|
||||
char buf[1024];
|
||||
if (!isImmMovOpcode(cn->operation))
|
||||
fprintf(stderr, "bad operation in node %lu\n", node->nodeId);
|
||||
snprintf(buf, 1024, "%s %hu %lu",
|
||||
regex024_opcode_tostr(cn->operation), cn->key, cn->imm_value);
|
||||
print_edge(node, cn->nxt_node,std::string(buf), fd, bd);
|
||||
} else if (node->type == track_array_mov_halfinvariant){
|
||||
FA_NodeOfTrackArrayMovHalfinvariant* cn = dynamic_cast<FA_NodeOfTrackArrayMovHalfinvariant *>(node);
|
||||
char buf[1024];
|
||||
if (!isCurPosMovOpcode(cn->operation))
|
||||
fprintf(stderr, "bad operation in node %lu\n", node->nodeId);
|
||||
snprintf(buf, 1024, "%s %hu",
|
||||
regex024_opcode_tostr(cn->operation), cn->key);
|
||||
print_edge(node, cn->nxt_node,std::string(buf), fd, bd);
|
||||
} else if (node->type == det_char_crossroads){
|
||||
FA_NodeOfDetCharCrossroads* cn = dynamic_cast<FA_NodeOfDetCharCrossroads *>(node);
|
||||
for (const auto& transition: cn->crossroads){
|
||||
std::string str = stringify_codeset(transition.input);
|
||||
print_edge(node, transition.nxt_node, str + (cn->second_ns ? std::string(" ") + STAR : ""),
|
||||
fd, bd);
|
||||
}
|
||||
}
|
||||
}
|
||||
fprintf(fd, "}\n");
|
||||
}
|
||||
|
||||
/* Creates the file at apath (or empties an existing one) and returns a stream that
 * appends to it.
 * Returns NULL when the file cannot be opened; the reason is reported through perror.
 * The caller owns the returned stream and has to fclose() it. */
FILE* get_fd(const char* apath){
    /* Mode "w" creates the file if necessary and truncates it to zero length */
    FILE* truncating = fopen(apath, "w");
    if (!truncating) {
        perror("fopen w");
        return NULL;
    }
    /* This stream only served to truncate the file; close it so that it is not leaked */
    fclose(truncating);

    FILE* fd = fopen(apath, "a");
    if (!fd)
        perror("fopen a");
    return fd;
}
|
||||
|
||||
void show_fa_with_sxiv_after_dot(const FA_Container& fa, const KnownTrackingTools& ktr,
|
||||
const RegexPriorityTable& priority_table) {
|
||||
const char* temp_gv = "FAGraph.gv";
|
||||
const char* temp_png = "FAGraph.png";
|
||||
int temp_descriptor = open(temp_gv, O_CLOEXEC | O_APPEND | O_CREAT | O_WRONLY, S_IRWXU | S_IRWXG);
|
||||
assert(temp_descriptor >= 0);
|
||||
assert(fa.start);
|
||||
FILE* fd = get_fd(temp_gv);
|
||||
print_fa(fa, fd, ktr, priority_table);
|
||||
fclose(fd);
|
||||
char cmdBuf[1024];
|
||||
// todo: get rid of temporary dot file and shell usage
|
||||
snprintf(cmdBuf, 1024, "dot %s -Tpng >%s", temp_gv, temp_png);
|
||||
int chw = system(cmdBuf);
|
||||
assert(WIFEXITED(chw));
|
||||
assert(WEXITSTATUS(chw) == 0);
|
||||
snprintf(cmdBuf, 1024, "sxiv %s", temp_png);
|
||||
chw = system(cmdBuf);
|
||||
assert(WIFEXITED(chw));
|
||||
assert(WEXITSTATUS(chw) == 0);
|
||||
assert(chw >= 0);
|
||||
unlink(temp_gv);
|
||||
unlink(temp_png);
|
||||
}
|
12
src/debugging_regexis024/debug_through_graphviz.h
Normal file
12
src/debugging_regexis024/debug_through_graphviz.h
Normal file
@ -0,0 +1,12 @@
|
||||
#ifndef DEBUGGING_REGEXIS024_DEBUG_THROUGH_GRAPHVIZ_H
|
||||
#define DEBUGGING_REGEXIS024_DEBUG_THROUGH_GRAPHVIZ_H
|
||||
|
||||
#include <libregexis024fa/finite_automaton.h>
|
||||
#include <libregexis024sol/part_of_expr_that_tracks.h>
|
||||
#include <libregexis024fa/selarr_priority_table.h>
|
||||
|
||||
/* Uses the temporary files FAGraph.gv and FAGraph.png, the `dot` command and the `sxiv` image viewer */
|
||||
void show_fa_with_sxiv_after_dot(const FA_Container& fa, const KnownTrackingTools& ktr,
|
||||
const RegexPriorityTable& priority_table);
|
||||
|
||||
#endif
|
89
src/debugging_regexis024/prettyprint/prettyprint_util.cpp
Normal file
89
src/debugging_regexis024/prettyprint/prettyprint_util.cpp
Normal file
@ -0,0 +1,89 @@
|
||||
#include <debugging_regexis024/prettyprint/prettyprint_util.h>
|
||||
#include <functional>
|
||||
#include <libregexis024vm/utils.h>
|
||||
|
||||
/* Creates a node without children that holds a copy of the given text. */
TreeWithStringsNode::TreeWithStringsNode(const std::string &val)
    : val(val)
{}
|
||||
|
||||
/* Pieces of a tree drawing. Each piece covers one nesting level and all four are
 * exactly four columns wide, so that deeper levels line up under their parent. */
static const char* const ch_empty = "    ";
static const char* const ch_passing_by = "\u2502   ";
static const char* const ch_connect_right_and_forward = "\u251c\u2500\u2500\u2500";
static const char* const ch_connect_right_last = "\u2514\u2500\u2500\u2500";

/* Pieces of the double-line frame drawn by wrapWithBox */
static const char* const ch_box_left_side = "\u2551";
static const char* const ch_box_right_side = "\u2551";
static const char* const ch_box_top_side = "\u2550";
static const char* const ch_box_bottom_side = "\u2550";
static const char* const ch_box_crn_top_left = "\u2554";
static const char* const ch_box_crn_top_right = "\u2557";
static const char* const ch_box_crn_bottom_left = "\u255A";
static const char* const ch_box_crn_bottom_right = "\u255D";
|
||||
|
||||
size_t length_of_line(const std::string& str) {
|
||||
size_t ch = 0;
|
||||
size_t pos = 0;
|
||||
while (pos < str.size()) {
|
||||
int32_t code;
|
||||
size_t adj;
|
||||
utf8_string_iterat(code, adj, pos, reinterpret_cast<const uint8_t*>(str.data()), str.size());
|
||||
if (code < 0)
|
||||
return ch;
|
||||
ch++;
|
||||
pos += adj;
|
||||
}
|
||||
return ch;
|
||||
}
|
||||
|
||||
/* Warning: recursion used */
|
||||
void toLines_dfs(const TreeWithStringsNode& node, lines& out, std::vector<bool>& prefix) {
|
||||
out.push_back("");
|
||||
size_t n = prefix.size();
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
if (i + 1 < n) {
|
||||
out.back() += prefix[i] ? ch_passing_by : ch_empty;
|
||||
} else {
|
||||
out.back() += prefix[i] ? ch_connect_right_and_forward : ch_connect_right_last;
|
||||
}
|
||||
}
|
||||
out.back() += node.val;
|
||||
prefix.push_back(true);
|
||||
size_t m = node.childeren.size();
|
||||
for (size_t i = 0; i < m; i++) {
|
||||
if (i + 1 == m)
|
||||
prefix[n] = false;
|
||||
toLines_dfs(node.childeren[i], out, prefix);
|
||||
}
|
||||
prefix.pop_back();
|
||||
}
|
||||
|
||||
/* Appends the drawing of the tree rooted at this node to out, one line per node. */
void TreeWithStringsNode::toLines(lines &out) const {
    std::vector<bool> open_levels;
    toLines_dfs(*this, out, open_levels);
}
|
||||
|
||||
/* Returns str repeated n times (an empty string for n == 0). */
std::string strMul(size_t n, const char* str) {
    std::string repeated;
    while (n-- > 0)
        repeated.append(str);
    return repeated;
}
|
||||
|
||||
/* Returns the given lines surrounded by a frame.
 * Every line is padded with spaces on the right to the width of the longest one, where
 * widths are measured with length_of_line. */
lines wrapWithBox(const lines &in) {
    size_t inner_width = 0;
    for (size_t k = 0; k < in.size(); k++) {
        const size_t width = length_of_line(in[k]);
        if (width > inner_width)
            inner_width = width;
    }

    lines boxed;

    std::string top = ch_box_crn_top_left;
    top += strMul(inner_width, ch_box_top_side);
    top += ch_box_crn_top_right;
    boxed.push_back(top);

    for (const std::string& row: in) {
        std::string framed = ch_box_left_side;
        framed += row;
        framed += strMul(inner_width - length_of_line(row), " ");
        framed += ch_box_right_side;
        boxed.push_back(framed);
    }

    std::string bottom = ch_box_crn_bottom_left;
    bottom += strMul(inner_width, ch_box_bottom_side);
    bottom += ch_box_crn_bottom_right;
    boxed.push_back(bottom);

    return boxed;
}
|
||||
|
||||
void printLines(const lines &in) {
|
||||
for (auto& l: in)
|
||||
printf("%s\n", l.c_str());
|
||||
}
|
||||
|
25
src/debugging_regexis024/prettyprint/prettyprint_util.h
Normal file
25
src/debugging_regexis024/prettyprint/prettyprint_util.h
Normal file
@ -0,0 +1,25 @@
|
||||
#ifndef DEBUGGING_REGEXIS024_PRETTYPRINT_UTIL_H
|
||||
#define DEBUGGING_REGEXIS024_PRETTYPRINT_UTIL_H
|
||||
|
||||
/* Used for debug. Do not give to user */
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
typedef std::vector<std::string> lines;
|
||||
|
||||
struct TreeWithStringsNode {
|
||||
std::string val;
|
||||
std::vector<TreeWithStringsNode> childeren;
|
||||
|
||||
explicit TreeWithStringsNode(const std::string &val);
|
||||
TreeWithStringsNode() = default;
|
||||
|
||||
void toLines(lines& out) const;
|
||||
};
|
||||
|
||||
lines wrapWithBox(const lines& in);
|
||||
|
||||
void printLines(const lines& in);
|
||||
|
||||
#endif
|
58
src/debugging_regexis024/vm/libregexis024vm_debug.cpp
Normal file
58
src/debugging_regexis024/vm/libregexis024vm_debug.cpp
Normal file
@ -0,0 +1,58 @@
|
||||
#include <debugging_regexis024/vm/libregexis024vm_debug.h>
|
||||
#include <stdio.h>
|
||||
#include <string>
|
||||
|
||||
std::string thread_to_str(const REGEX_IS024_Thread& thread){
|
||||
if (!(thread.slot_occupation_status & SLOT_OCCUPIED))
|
||||
return "{ unoccupied }";
|
||||
char buf[1024];
|
||||
snprintf(buf, 1024, "{ IP = %lu }", thread.IP);
|
||||
return buf;
|
||||
}
|
||||
|
||||
std::string stack_to_str(const REGEX_IS024_Stack& stack){
|
||||
std::string res = "{ ";
|
||||
for (uint32_t i = 0; i < stack.sz; i++){
|
||||
if (i != 0)
|
||||
res += ", ";
|
||||
res += std::to_string(stack.slots[i]);
|
||||
}
|
||||
res += " }";
|
||||
return res;
|
||||
}
|
||||
|
||||
std::string slots_to_str(const REGEX_IS024_CONTEXT& ctx){
|
||||
if (!ctx.initialized)
|
||||
return "uninitialized";
|
||||
std::string READ_slots;
|
||||
for (size_t i = 0; i < ctx.read_slots_number; i++){
|
||||
uint8_t stat = ctx.READ_halted_slots[i].slot_occupation_status;
|
||||
READ_slots += (stat & SLOT_OCCUPIED) ? ((stat & SLOT_NEW) ? "N" : "O") : "x";
|
||||
}
|
||||
std::string FORK_slots;
|
||||
for (size_t i = 0; i < ctx.fork_slots_number; i++){
|
||||
uint8_t stat = ctx.FORK_halted_slots[i].slot_occupation_status;
|
||||
FORK_slots += (stat & SLOT_OCCUPIED) ? "O" : "x";
|
||||
}
|
||||
char buf[4096];
|
||||
snprintf(buf, 4096, "READ_slots: %s ; FORK_slots: %s ; READ_stack_new_main: %s ; "
|
||||
"READ_stack_new_second: %s ; READ_stack_old: %s ; FORK_stack: %s",
|
||||
READ_slots.c_str(), FORK_slots.c_str(), stack_to_str(ctx.READ_halted_stack_new_first).c_str(),
|
||||
stack_to_str(ctx.READ_halted_stack_new_second).c_str(),
|
||||
stack_to_str(ctx.READ_halted_stack_old).c_str(), stack_to_str(ctx.FORK_halted_stack).c_str());
|
||||
return buf;
|
||||
}
|
||||
|
||||
/* Prints a debug dump of the context to standard output: a header naming `place`, then
 * the active thread, the thread being sifted with ("NO" when there is none), the matched
 * thread and the slot summary produced by slots_to_str. */
void debug_print_context(const REGEX_IS024_CONTEXT& ctx, const char* place) {
    printf("== DEBUG `%s` ==\n", place);

    const std::string active = thread_to_str(ctx.active_thread);
    std::string sifting = "NO";
    if (ctx.sifting_with)
        sifting = thread_to_str(*ctx.sifting_with);
    const std::string matched = thread_to_str(ctx.matched_thread);
    const std::string slots = slots_to_str(ctx);

    printf("Active thread: %s, sifting_with: %s, match: %s\n%s\n",
        active.c_str(), sifting.c_str(), matched.c_str(), slots.c_str());
}
|
||||
|
||||
/* Prints a debug dump of one thread to standard output: a header naming `place`
 * followed by the description produced by thread_to_str. */
void debug_print_thread(const REGEX_IS024_Thread& thr, const char *place) {
    const std::string description = thread_to_str(thr);
    printf("== DEBUG `%s` ==\n" "This thread: %s\n", place, description.c_str());
}
|
11
src/debugging_regexis024/vm/libregexis024vm_debug.h
Normal file
11
src/debugging_regexis024/vm/libregexis024vm_debug.h
Normal file
@ -0,0 +1,11 @@
|
||||
#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024VM_DEBUG_H
|
||||
#define LIBREGEXIS024_SRC_LIBREGEXIS024VM_DEBUG_H
|
||||
|
||||
#include <libregexis024vm/libregexis024vm.h>
|
||||
#include <libregexis024vm/instruction_implementation.h>
|
||||
|
||||
void debug_print_context(const REGEX_IS024_CONTEXT& ctx, const char* place);
|
||||
|
||||
void debug_print_thread(const REGEX_IS024_Thread& thr, const char *place);
|
||||
|
||||
#endif
|
120
src/libregexis024fa/codeset.cpp
Normal file
120
src/libregexis024fa/codeset.cpp
Normal file
@ -0,0 +1,120 @@
|
||||
#include <libregexis024fa/codeset.h>
|
||||
#include <assert.h>
|
||||
|
||||
/* Returns the complement of X within [0, UINT32_MAX].
 * X is expected to be a sorted list of non-touching ranges; the result consists of the
 * gap before the first range, the gaps between neighbouring ranges and the gap after
 * the last one. */
codeset_t invert_set(const codeset_t &X) {
    codeset_t gaps;
    if (X.empty()) {
        gaps.emplace_back(0, UINT32_MAX);
        return gaps;
    }
    const uint32_t lowest = X.front().first;
    if (lowest > 0)
        gaps.emplace_back(0, lowest - 1);
    for (size_t k = 1; k < X.size(); k++)
        gaps.emplace_back(X[k - 1].second + 1, X[k].first - 1);
    const uint32_t highest = X.back().second;
    if (highest < UINT32_MAX)
        gaps.emplace_back(highest + 1, UINT32_MAX);
    return gaps;
}
|
||||
|
||||
/* Returns the union of two codesets.
 * Both inputs are expected to be sorted lists of ranges; the result is sorted as well and
 * ranges that overlap or directly touch each other (end + 1 == next start) are fused. */
codeset_t merge_sets(const codeset_t &A, const codeset_t &B) {
    codeset_t res;
    const size_t a_count = A.size();
    const size_t b_count = B.size();
    size_t a_pos = 0;
    size_t b_pos = 0;
    while (a_pos < a_count || b_pos < b_count){
        /* Start a new output range from whichever input range begins first (B wins a tie) */
        std::pair<uint32_t, uint32_t> cur;
        if (b_pos == b_count || (a_pos < a_count && A[a_pos].first < B[b_pos].first))
            cur = A[a_pos++];
        else
            cur = B[b_pos++];

        /* Swallow every following range that overlaps or touches the current one.
         * The UINT32_MAX test keeps cur.second + 1 from wrapping around. */
        while (true){
            if (a_pos < a_count && (cur.second == UINT32_MAX || A[a_pos].first <= cur.second + 1)){
                cur.second = std::max(A[a_pos].second, cur.second);
                a_pos++;
            } else if (b_pos < b_count && (cur.second == UINT32_MAX || B[b_pos].first <= cur.second + 1)){
                cur.second = std::max(B[b_pos].second, cur.second);
                b_pos++;
            } else {
                break;
            }
        }
        res.push_back(cur);
    }
    return res;
}

/* Returns the intersection of two codesets.
 * Both inputs are expected to be sorted lists of non-overlapping ranges. */
codeset_t intersect_sets(const codeset_t &A, const codeset_t &B) {
    codeset_t res;
    size_t a_pos = 0;
    size_t b_pos = 0;
    while (a_pos < A.size() && b_pos < B.size()){
        const std::pair<uint32_t, uint32_t>& a = A[a_pos];
        const std::pair<uint32_t, uint32_t>& b = B[b_pos];
        /* The common part of two ranges, if any, spans from the later start to the earlier end */
        const uint32_t common_start = std::max(a.first, b.first);
        const uint32_t common_end = std::min(a.second, b.second);
        if (common_start <= common_end)
            res.emplace_back(common_start, common_end);
        /* The range that ends first cannot intersect anything further */
        if (a.second <= b.second)
            a_pos++;
        else
            b_pos++;
    }
    return res;
}
|
||||
|
||||
codeset_t subtract_sets(const codeset_t &A, const codeset_t &B) {
|
||||
return intersect_sets(A, invert_set(B));
|
||||
}
|
||||
|
||||
bool is_inside(uint32_t start, uint32_t end, codeset_t &X) {
|
||||
for (auto& p: X){
|
||||
if (p.first <= start && end <= p.second)
|
||||
return true;
|
||||
assert(end < p.first || p.second < start);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Returns X united with the single code point cp. */
codeset_t set_add_char(const codeset_t& X, uint32_t cp) {
    codeset_t single;
    single.emplace_back(cp, cp);
    return merge_sets(X, single);
}
|
||||
|
||||
/* Returns X united with the segment whose borders are start and end. */
codeset_t set_add_range(const codeset_t& X, uint32_t start, uint32_t end) {
    codeset_t single;
    single.emplace_back(start, end);
    return merge_sets(X, single);
}
|
||||
|
||||
/* Builds a codeset consisting of the single segment [ch, ch]. */
codeset_t codeset_of_one_char(uint32_t ch) {
    codeset_t single;
    single.emplace_back(ch, ch);
    return single;
}
|
||||
|
||||
std::string stringifyCodesetBase10(const codeset_t& CS) {
|
||||
std::string cs;
|
||||
for (auto p: CS) {
|
||||
if (!cs.empty())
|
||||
cs += "; ";
|
||||
cs += std::to_string(p.first) + "-" + std::to_string(p.second);
|
||||
}
|
||||
return cs;
|
||||
}
|
27
src/libregexis024fa/codeset.h
Normal file
27
src/libregexis024fa/codeset.h
Normal file
@ -0,0 +1,27 @@
|
||||
#ifndef LIBREGEXIS024_CODESET_H
|
||||
#define LIBREGEXIS024_CODESET_H
|
||||
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
#include <stdint.h>
|
||||
#include <string>
|
||||
|
||||
typedef std::vector<std::pair<uint32_t, uint32_t>> codeset_t;
|
||||
|
||||
codeset_t invert_set(const codeset_t& X);
|
||||
codeset_t merge_sets(const codeset_t& A, const codeset_t& B);
|
||||
codeset_t intersect_sets(const codeset_t& A, const codeset_t& B);
|
||||
codeset_t subtract_sets(const codeset_t& A, const codeset_t& B);
|
||||
|
||||
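/* Returns true when [start, end] lies entirely within one segment of X.
 * Unsafe: asserts (aborts) if [start, end] only partially overlaps a segment, i.e. crosses a segment edge */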
|
||||
bool is_inside(uint32_t start, uint32_t end, codeset_t& X);
|
||||
|
||||
codeset_t set_add_char(const codeset_t& X, uint32_t cp);
|
||||
codeset_t set_add_range(const codeset_t& X, uint32_t start, uint32_t end);
|
||||
|
||||
codeset_t codeset_of_one_char(uint32_t ch);
|
||||
#define codeset_of_all codeset_t({{0, UINT32_MAX}})
|
||||
|
||||
std::string stringifyCodesetBase10(const codeset_t& CS);
|
||||
|
||||
#endif //LIBREGEXIS024_CODESET_H
|
183
src/libregexis024fa/colored_codeset.cpp
Normal file
183
src/libregexis024fa/colored_codeset.cpp
Normal file
@ -0,0 +1,183 @@
|
||||
#include <libregexis024fa/colored_codeset.h>
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
/* Creates a detached segment with the given color and (inclusive) right border. */
ColoredCodesetSegment::ColoredCodesetSegment(uint32_t color, uint32_t right_code)
    : color(color),
      right_code(right_code)
{
}
|
||||
|
||||
/* A fresh list is one segment of color 0 whose right border is UINT32_MAX. */
ColoredCodesetSegmentList::ColoredCodesetSegmentList()
    : first(new ColoredCodesetSegment(0, UINT32_MAX))
{
}
|
||||
|
||||
void ColoredCodesetSegmentList::replace_myself(const ColoredCodesetSegmentList &other) {
|
||||
assert(other.first);
|
||||
ColoredCodesetSegment** in_cur = &first;
|
||||
ColoredCodesetSegment* in_other = other.first;
|
||||
while (in_other) {
|
||||
*in_cur = new ColoredCodesetSegment(*in_other);
|
||||
in_cur = &((**in_cur).next);
|
||||
in_other = in_other->next;
|
||||
}
|
||||
}
|
||||
|
||||
/* Copy constructor: deep-copies the chain of `other`. */
ColoredCodesetSegmentList::ColoredCodesetSegmentList(const ColoredCodesetSegmentList &other)
{
    this->replace_myself(other);
}
|
||||
|
||||
void ColoredCodesetSegmentList::free_myself() {
|
||||
ColoredCodesetSegment* cur = first;
|
||||
while (cur) {
|
||||
ColoredCodesetSegment* nxt = cur->next;
|
||||
delete cur;
|
||||
cur = nxt;
|
||||
}
|
||||
}
|
||||
|
||||
/* Destructor: releases the whole chain. */
ColoredCodesetSegmentList::~ColoredCodesetSegmentList()
{
    this->free_myself();
}
|
||||
|
||||
/* Copy assignment: releases the current chain and deep-copies the chain of `other`.
 * Returns *this. */
ColoredCodesetSegmentList& ColoredCodesetSegmentList::operator=(const ColoredCodesetSegmentList &other) {
    /* Without this guard `x = x` would free the chain and then copy from the
     * freed nodes (use after free). */
    if (this == &other)
        return *this;
    free_myself();
    replace_myself(other);
    return *this;
}
|
||||
|
||||
/* Starts with exactly one color (index 0) that has no requests attached.
 * dummy_n is stored as DummyN. */
ColoredCodeset::ColoredCodeset(uint64_t dummy_n): DummyN(dummy_n) {
    requests.assign(1, std::vector<size_t>());
}
|
||||
|
||||
void ColoredCodeset::split_phase(const codeset_t &X) {
|
||||
|
||||
uint32_t cA = 0;
|
||||
ColoredCodesetSegment* cur_seg = list.first;
|
||||
|
||||
uint32_t pi = 0;
|
||||
|
||||
auto advance_old = [&]()->void{
|
||||
cA = cur_seg->right_code + 1;
|
||||
cur_seg = cur_seg->next;
|
||||
};
|
||||
|
||||
/* How to use: splits are made from left to right. After each split cur_seg
|
||||
* points to the rightest among sub-segments of cur_segment. */
|
||||
auto SPLIT = [&](uint32_t code_before_split)->void {
|
||||
assert(code_before_split < cur_seg->right_code);
|
||||
ColoredCodesetSegment* new_next = new ColoredCodesetSegment(cur_seg->color, cur_seg->right_code);
|
||||
new_next->divisor_on_left = true;
|
||||
cur_seg->right_code = code_before_split;
|
||||
new_next->next = cur_seg->next;
|
||||
cur_seg->next = new_next;
|
||||
advance_old();
|
||||
};
|
||||
|
||||
while (cur_seg && pi < X.size()) {
|
||||
uint32_t cB = cur_seg->right_code;
|
||||
uint32_t L = X[pi].first, R = X[pi].second;
|
||||
|
||||
if (L < cA) {
|
||||
if (R != UINT32_MAX && R + 1 < cA) {
|
||||
pi++;
|
||||
} else if (R != UINT32_MAX && R + 1 == cA) {
|
||||
cur_seg->divisor_on_left = true;
|
||||
pi++;
|
||||
} else if (R < cB) {
|
||||
SPLIT(R);
|
||||
pi++;
|
||||
} else {
|
||||
advance_old();
|
||||
}
|
||||
} else if (L == cA) {
|
||||
cur_seg->divisor_on_left = true;
|
||||
if (R < cB) {
|
||||
SPLIT(R);
|
||||
pi++;
|
||||
} else {
|
||||
advance_old();
|
||||
}
|
||||
} else if (L <= cB) {
|
||||
SPLIT(L - 1);
|
||||
if (R < cB) {
|
||||
SPLIT(R);
|
||||
pi++;
|
||||
} else {
|
||||
advance_old();
|
||||
}
|
||||
} else {
|
||||
advance_old();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ColoredCodeset::apply_divisor(const codeset_t &X) {
|
||||
split_phase(X);
|
||||
size_t X_id = nxt_request_id++;
|
||||
size_t m = requests.size();
|
||||
size_t bm = m;
|
||||
std::vector<bool> skipped(bm, false);
|
||||
std::vector<bool> overlapped(bm, false);
|
||||
{
|
||||
bool inside = false;
|
||||
ColoredCodesetSegment* cur = list.first;
|
||||
while (cur) {
|
||||
inside = (inside != cur->divisor_on_left);
|
||||
if (inside) {
|
||||
overlapped[cur->color] = true;
|
||||
} else {
|
||||
skipped[cur->color] = true;
|
||||
}
|
||||
cur = cur->next;
|
||||
}
|
||||
}
|
||||
std::vector<uint32_t> alt_color(bm, 0);
|
||||
for (size_t i = 0; i < bm; i++) {
|
||||
if (skipped[i] && overlapped[i]) {
|
||||
alt_color[i] = m++;
|
||||
requests.push_back(requests[i]);
|
||||
if (X_id >= DummyN)
|
||||
requests.back().push_back(X_id - DummyN);
|
||||
} else if (overlapped[i]) {
|
||||
if (X_id >= DummyN)
|
||||
requests[i].push_back(X_id - DummyN);
|
||||
} else
|
||||
assert(skipped[i]);
|
||||
}
|
||||
{
|
||||
bool inside = false;
|
||||
ColoredCodesetSegment* cur = list.first;
|
||||
while (cur) {
|
||||
inside = (inside != cur->divisor_on_left);
|
||||
cur->divisor_on_left = false;
|
||||
uint32_t c = cur->color;
|
||||
if (inside && skipped[c] && overlapped[c]) {
|
||||
cur->color = alt_color[c];
|
||||
}
|
||||
cur = cur->next;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ColoredCodeset::get_splits_of_non_dummy(std::vector<codeset_t> &res_input,
|
||||
std::vector<std::vector<size_t>> &res_color_to_requests) {
|
||||
size_t n = requests.size();
|
||||
std::vector<ssize_t> nonclean_to_clean(n, -1);
|
||||
res_color_to_requests = {};
|
||||
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
if (!requests[i].empty()) {
|
||||
nonclean_to_clean[i] = res_color_to_requests.size();
|
||||
res_color_to_requests.push_back(requests[i]);
|
||||
}
|
||||
}
|
||||
|
||||
ColoredCodesetSegment* cur = list.first;
|
||||
uint32_t L = 0;
|
||||
res_input.assign(res_color_to_requests.size(), {});
|
||||
while (cur) {
|
||||
size_t Sc = cur->color;
|
||||
if (nonclean_to_clean[Sc] >= 0) {
|
||||
res_input[nonclean_to_clean[Sc]].emplace_back(L, cur->right_code);
|
||||
}
|
||||
L = cur->right_code + 1;
|
||||
cur = cur->next;
|
||||
}
|
||||
}
|
66
src/libregexis024fa/colored_codeset.h
Normal file
66
src/libregexis024fa/colored_codeset.h
Normal file
@ -0,0 +1,66 @@
|
||||
#ifndef LIBREGEXIS024_COLORED_CODESET_H
|
||||
#define LIBREGEXIS024_COLORED_CODESET_H
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <vector>
|
||||
|
||||
#include <libregexis024fa/codeset.h>
|
||||
|
||||
/* Used for determinizer. Nowhere else */
|
||||
|
||||
/* One node of the singly linked segment list of a ColoredCodeset.
 * Only the right border is stored; the left border is implied by the
 * preceding node (0 for the first node). */
struct ColoredCodesetSegment {
    /* Index into ColoredCodeset::requests */
    uint32_t color;
    /* Last code point (inclusive) covered by this segment */
    uint32_t right_code;
    /* Following segment, null for the last one */
    ColoredCodesetSegment* next = nullptr;

    /* Temporary variable (used by the apply_divisor() method) */
    bool divisor_on_left = false;

    ColoredCodesetSegment(uint32_t color, uint32_t right_code);
};
|
||||
|
||||
/* Warning!!! This stupid class is OOM-unsafe!!!
|
||||
* This is not an issue as far as you don't show any of it's instance to the user of libregexis024 */
|
||||
struct ColoredCodesetSegmentList {
|
||||
ColoredCodesetSegment* first = NULL;
|
||||
|
||||
ColoredCodesetSegmentList();
|
||||
|
||||
|
||||
void replace_myself(const ColoredCodesetSegmentList& other);
|
||||
|
||||
ColoredCodesetSegmentList(const ColoredCodesetSegmentList& other);
|
||||
|
||||
/* Use only internally */
|
||||
void free_myself();
|
||||
|
||||
~ColoredCodesetSegmentList();
|
||||
|
||||
ColoredCodesetSegmentList& operator=(const ColoredCodesetSegmentList& other);
|
||||
};
|
||||
|
||||
/* Highly unoptimized algorithm on this data structure O(C^2) time*/
|
||||
class ColoredCodeset {
|
||||
ColoredCodesetSegmentList list;
|
||||
/* Size of this vector is equal to the number of colors */
|
||||
std::vector<std::vector<size_t>> requests;
|
||||
uint64_t DummyN;
|
||||
size_t nxt_request_id = 0;
|
||||
|
||||
void split_phase(const codeset_t& X);
|
||||
public:
|
||||
/* First dummy_n split requests will be viewed as 'dummy requests', when complete map of splits is requested,
|
||||
* colors that are registed indide only dummy requests won't be returned. */
|
||||
ColoredCodeset(uint64_t dummy_n);
|
||||
|
||||
/* O(C, which is bad, but my library's compiler is already slow by itself, so who cares) */
|
||||
void apply_divisor(const codeset_t& X);
|
||||
|
||||
/* Returned 'requests' mapping will feature request id's with DummyN substituted from them */
|
||||
void get_splits_of_non_dummy(std::vector<codeset_t>& res_input,
|
||||
std::vector<std::vector<size_t>>& res_color_to_requests);
|
||||
};
|
||||
|
||||
|
||||
#endif
|
191
src/libregexis024fa/fa_first_stage_fix.cpp
Normal file
191
src/libregexis024fa/fa_first_stage_fix.cpp
Normal file
@ -0,0 +1,191 @@
|
||||
#include <libregexis024fa/fa_first_stage_fix.h>
|
||||
#include <libregexis024fa/misc_fa_funcs.h>
|
||||
#include <libregexis024vm/utils.h>
|
||||
#include <assert.h>
|
||||
|
||||
// #ifdef LIBREGEXIS024_DEBUG
|
||||
// #include <debugging_regexis024/debug_through_graphviz.h>
|
||||
// #endif
|
||||
|
||||
/* Pushes the restriction of every look_one_ahead node of sourceFa onto the nodes
 * reachable from it through empty transitions, then copies the automaton into
 * resultFa with the look_one_ahead nodes skipped.
 * sourceFa is modified along the way (nodes are copied into it, edges are reattached).
 * The returned info has
 *   fed_chars_extend_one_right set if a restriction was applied to a match node,
 *   fed_chars_extend_one_left  set if a look_one_behind node was met while walking
 *                              empty transitions from resultFa.start. */
REGEX_IS024_FA_FirstStageFixInfo first_stage_fix_fa(FA_Container& sourceFa, FA_Container& resultFa) {
    assert(sourceFa.start);
    REGEX_IS024_FA_FirstStageFixInfo info;

    /* Per-node bookkeeping for the closure of one look_one_ahead node */
    struct Marked{
        FA_Node* node;
        /* Number of edges into `node` that were counted during the closure walk */
        size_t refs_from_my = 1;
        bool making_copy = false;
        FA_Node* copy = NULL;

        explicit Marked(FA_Node *node) : node(node) {}
    };

    /* The size is re-read on every iteration on purpose: the body calls
     * copy_fa_node(..., sourceFa) while this loop is running. */
    for (size_t scanIdx = 0; scanIdx < sourceFa.all.size(); scanIdx++){
        FA_Node* beg = sourceFa.all[scanIdx];
        if (beg->type != look_one_ahead)
            continue;
        FA_NodeOfLookOneAhead& loa = (*(FA_NodeOfLookOneAhead*)beg);
        codeset_t& restriction = loa.restriction;
        assert(loa.nxt_node);

        /* Step 1: collect the closure over empty transitions, counting internal edges */
        std::vector<Marked> closure;
        closure.emplace_back(loa.nxt_node);
        /* NOTE(review): the mark is put on `beg`, while closure[0] holds loa.nxt_node whose
         * own mark is not touched here; the `closure[x.search_mark]` lookups below look like
         * they expect every collected node to carry its index. The cleanup at the end of this
         * iteration does not reset beg's mark either. Confirm this is intended. */
        beg->search_mark = 0;

        for (size_t visited = 0; visited < closure.size(); visited++){
            FA_Node& here = *closure[visited].node;
            for (FA_Node** edge : here.get_all_empty_valid_transitions()){
                FA_Node& target = **edge;
                if (target.search_mark == -1){
                    assert(target.nodeId != loa.nodeId);
                    target.search_mark = (int64_t)closure.size();
                    closure.emplace_back(*edge);
                } else {
                    closure[target.search_mark].refs_from_my++;
                }
            }
        }

        /* Step 2: a node with more references than were counted above has to be copied;
         * the need to copy spreads along empty transitions. */
        std::vector<FA_Node*> toCopy;
        for (Marked& entry: closure){
            if (entry.refs_from_my < entry.node->refs){
                entry.making_copy = true;
                toCopy.push_back(entry.node);
            }
        }
        while (!toCopy.empty()){
            FA_Node& original = *toCopy.back();
            toCopy.pop_back();
            assert(closure[original.search_mark].making_copy);
            /* Because of this operation sourceFa is not read-only. It becomes useless after generating resultFa */
            closure[original.search_mark].copy = copy_fa_node(original, sourceFa);

            for (FA_Node** edge: original.get_all_empty_valid_transitions()){
                Marked& targetEntry = closure[(**edge).search_mark];
                if (!targetEntry.making_copy){
                    targetEntry.making_copy = true;
                    toCopy.push_back(*edge);
                }
            }
        }

        /* Step 3: redirect edges to the copies and apply the restriction */
        for (Marked& entry : closure){
            FA_Node* effective = entry.making_copy ? entry.copy : entry.node;
            for (FA_Node** edge: effective->get_all_empty_valid_transitions()){
                Marked& targetEntry = closure[(**edge).search_mark];
                if (targetEntry.making_copy)
                    reattach_fa_node_edge(edge, targetEntry.copy);
            }
            effective->apply_lookahead_restriction(restriction);
            if (effective->type == match)
                info.fed_chars_extend_one_right = true;
        }

        /* Step 4: release the marks of the collected nodes */
        for (Marked& entry: closure)
            entry.node->search_mark = -1;
    }

    // show_fa_with_sxiv_after_dot(sourceFa, {{}, {}}, {});

    {
        /* Now it's time to fill resultFa, skipping all look_one_ahead nodes */
        auto skipLookAheads = [&](FA_Node* v) -> FA_Node* {
            while (v->type == look_one_ahead){
                v = ((FA_NodeOfLookOneAhead*)v)->nxt_node;
            }
            return v;
        };

        resultFa.start = sourceFa.start;
        /* Pointers (living in resultFa) that still point at nodes of sourceFa */
        std::vector<FA_Node**> pendingSlots;
        pendingSlots.push_back(&(resultFa.start));
        /* Indexed by source node id; NULL until that node has been copied */
        std::vector<FA_Node*> resNodeOfSourceId(sourceFa.all.size(), NULL);

        while (!pendingSlots.empty()) {
            FA_Node** slot = pendingSlots.back();
            pendingSlots.pop_back();
            FA_Node* srcNode = skipLookAheads(*slot);
            size_t srcId = srcNode->nodeId;
            if (!resNodeOfSourceId[srcId]) {
                resNodeOfSourceId[srcId] = copy_fa_node_to_another_fa(*srcNode, resultFa);
                for (FA_Node** outgoing: resNodeOfSourceId[srcId]->get_all_transitions())
                    pendingSlots.push_back(outgoing);
            }
            *slot = resNodeOfSourceId[srcId];
            resNodeOfSourceId[srcId]->refs++;
        }
    }

    {
        /* Guessing info.fed_chars_extend_one_left */
        std::vector<FA_Node*> reached;
        reached.push_back(resultFa.start);
        resultFa.start->search_mark = 0;
        for (size_t visited = 0; visited < reached.size(); visited++){
            if (reached[visited]->type == look_one_behind){
                info.fed_chars_extend_one_left = true;
                break;
            }
            for (FA_Node** edge: reached[visited]->get_all_empty_valid_transitions()){
                if ((**edge).search_mark < 0){
                    (**edge).search_mark = 0;
                    reached.push_back(*edge);
                }
            }
        }
        for (FA_Node* node: reached)
            node->search_mark = -1;
    }
    return info;
}
|
||||
|
||||
/* Creates, inside `fa`, a one-char-read node (built from `restriction`, second argument
 * true) whose nxt_node edge is attached to a freshly made match node.
 * Returns the read node. */
FA_NodeOfOneCharRead* generate_alt_ending(const codeset_t& restriction, FA_Container& fa){
    FA_NodeOfOneCharRead* reader = fa.makeOneCharRead(restriction, true);
    FA_NodeOfMatch* ending = fa.makeMatch();
    ending->ext_filter_added = true; // Won't actually be used
    reattach_fa_node_edge(&(reader->nxt_node), ending);
    return reader;
}
|
||||
|
||||
void regular_second_stage_fix(const FA_Container& sourceFa, FA_Container& resultFa,
|
||||
const REGEX_IS024_FA_FirstStageFixInfo &info1)
|
||||
{
|
||||
assert(resultFa.all.empty() && !resultFa.start);
|
||||
if (!sourceFa.start)
|
||||
return;
|
||||
// todo: rewrite first stage using that cool technique I just invented
|
||||
resultFa.start = sourceFa.start;
|
||||
// A vector of pointers in resutFa to nodes that belong to sourceFa. They should undergo a little bit of copying.
|
||||
std::vector<FA_Node**> homework = {&(resultFa.start)};
|
||||
// source node id s index. Element is NULL if no copy (in resultFa) exists and resFa node if copying was performed
|
||||
std::vector<FA_Node*> sourceIdToResNode(sourceFa.all.size(), NULL);
|
||||
while (!homework.empty()) {
|
||||
FA_Node** vPtr = homework.back(); homework.pop_back();
|
||||
FA_Node* sourceV = *vPtr; assert(sourceV);
|
||||
size_t sourceVId = sourceV->nodeId;
|
||||
if (!sourceIdToResNode[sourceVId]) {
|
||||
if (sourceV->type == match) {
|
||||
FA_NodeOfMatch& mn = dynamic_cast<FA_NodeOfMatch&>(*sourceV);
|
||||
FA_NodeOfMatch* res_mn = resultFa.makeMatch();
|
||||
if (mn.ext_filter_added && mn.pending_filter != codeset_of_all) {
|
||||
assert(info1.fed_chars_extend_one_right);
|
||||
FA_NodeOfOneCharRead* res_ocr2n = resultFa.makeOneCharRead(mn.pending_filter, true);
|
||||
reattach_nxt_node(res_ocr2n, res_mn);
|
||||
sourceIdToResNode[sourceVId] = res_ocr2n;
|
||||
} else {
|
||||
sourceIdToResNode[sourceVId] = res_mn;
|
||||
}
|
||||
} else {
|
||||
sourceIdToResNode[sourceVId] = copy_fa_node_to_another_fa(*sourceV, resultFa);
|
||||
/* O_o */
|
||||
for (FA_Node** uuPtr: sourceIdToResNode[sourceVId]->get_all_transitions())
|
||||
homework.push_back(uuPtr);
|
||||
}
|
||||
}
|
||||
*vPtr = sourceIdToResNode[sourceVId];
|
||||
sourceIdToResNode[sourceVId]->refs++;
|
||||
}
|
||||
|
||||
if (info1.fed_chars_extend_one_left) {
|
||||
FA_NodeOfOneCharRead* ns = resultFa.makeOneCharRead(codeset_of_all, true);
|
||||
yay_new_start(resultFa, ns);
|
||||
}
|
||||
}
|
18
src/libregexis024fa/fa_first_stage_fix.h
Normal file
18
src/libregexis024fa/fa_first_stage_fix.h
Normal file
@ -0,0 +1,18 @@
|
||||
#ifndef LIBREGEXIS024_FA_FIRST_STAGE_FIX_H
|
||||
#define LIBREGEXIS024_FA_FIRST_STAGE_FIX_H
|
||||
|
||||
#include "finite_automaton.h"
|
||||
|
||||
/* Flags reported by first_stage_fix_fa() and consumed by regular_second_stage_fix(). */
struct REGEX_IS024_FA_FirstStageFixInfo {
    /* When set, regular_second_stage_fix() prepends a read-any-char node as the new start */
    bool fed_chars_extend_one_left = false;

    /* Asserted by regular_second_stage_fix() whenever a match node needs an extra filtered read */
    bool fed_chars_extend_one_right = false;
};
|
||||
|
||||
/* Will look for look_one_ahead nodes and apply their filter to reading filters ahead *
|
||||
* sourceFa will be ruined. The output will be in resultFa */
|
||||
REGEX_IS024_FA_FirstStageFixInfo first_stage_fix_fa(FA_Container& sourceFa, FA_Container& resultFa);
|
||||
|
||||
void regular_second_stage_fix(const FA_Container& sourceFa, FA_Container& resultFa,
|
||||
const REGEX_IS024_FA_FirstStageFixInfo &info1);
|
||||
|
||||
#endif //LIBREGEXIS024_FA_FIRST_STAGE_FIX_H
|
665
src/libregexis024fa/fa_make_deterministic.cpp
Normal file
665
src/libregexis024fa/fa_make_deterministic.cpp
Normal file
@ -0,0 +1,665 @@
|
||||
#include <libregexis024fa/fa_make_deterministic.h>
|
||||
#include <libregexis024fa/misc_fa_funcs.h>
|
||||
#include <libregexis024vm/utils.h> /* to get exitf */
|
||||
#include <assert.h>
|
||||
#include <libregexis024fa/tracking_fa_nodes.h>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <algorithm>
|
||||
#include <set>
|
||||
#include <libregexis024fa/colored_codeset.h>
|
||||
|
||||
#if defined(LIBREGEXIS024_DEBUG) && defined(LIBREGEXIS024_ALLOW_LOUD)
|
||||
#include <debugging_regexis024/prettyprint/prettyprint_util.h>
|
||||
#include <string>
|
||||
#include <functional>
|
||||
#include <stdio.h>
|
||||
#define PR_DEB
|
||||
#endif
|
||||
|
||||
/* debug nonsence */
|
||||
void input_fa_assert(const FA_Container& fa){
|
||||
assert(fa.start);
|
||||
for (FA_Node* node: fa.all){
|
||||
if (node->type == one_char_read){
|
||||
assert(!dynamic_cast<FA_NodeOfOneCharRead*>(node)->second_ns);
|
||||
} else if (node->type == look_one_ahead ||
|
||||
node->type == det_char_crossroads){
|
||||
exitf("not allowed at this stage\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct OperHistoryNodeTransition {
|
||||
TrackingOperationInFa op;
|
||||
size_t u;
|
||||
|
||||
OperHistoryNodeTransition(const TrackingOperationInFa &op, size_t u): op(op), u(u) {}
|
||||
};
|
||||
|
||||
struct OperHistoryNode {
|
||||
std::vector<OperHistoryNodeTransition> next;
|
||||
/* When it is part of clean history, this */
|
||||
std::vector<uint64_t> compressed_selarr;
|
||||
std::vector<uint64_t> raisin;
|
||||
|
||||
OperHistoryNode() = default;
|
||||
};
|
||||
|
||||
/* This object can also describe an empty superstate (needed to describe clean history
 * nodes without raisin). If sorted_raisin is empty, interpret it as an empty superstate. */
struct SuperState {
    std::vector<uint64_t> sorted_raisin;
    std::vector<uint64_t> double_compressed_selarr;

    bool empty() const {
        return sorted_raisin.empty();
    }

#ifdef PR_DEB
    /* Debug rendering of both vectors as comma separated lists */
    std::string toString() const {
        auto join = [](const std::vector<uint64_t>& values) {
            std::string joined;
            for (size_t k = 0; k < values.size(); k++) {
                if (k > 0)
                    joined += ", ";
                joined += std::to_string(values[k]);
            }
            return joined;
        };
        return "sorted_raisin: {" + join(sorted_raisin) + "}, double_comp_selarr: {" +
               join(double_compressed_selarr) + "}";
    }
#endif
};
|
||||
|
||||
struct CleanOperHistoryNode {
|
||||
std::vector<OperHistoryNodeTransition> next;
|
||||
SuperState exit;
|
||||
};
|
||||
|
||||
struct SelarrCompressionScheme {
|
||||
size_t SN1, SN2 = 0, SN3 = 0;
|
||||
std::vector<int32_t> S1_to_S2;
|
||||
std::vector<regex_tai_t> S2_to_sifter;
|
||||
std::vector<regex_tai_t> S3_to_sifter;
|
||||
const RegexPriorityTable& sifter;
|
||||
|
||||
SelarrCompressionScheme(size_t sn1, const RegexPriorityTable &sifter) : SN1(sn1), sifter(sifter) {
|
||||
assert(sifter.size() <= UINT32_MAX);
|
||||
S1_to_S2.assign(SN1, -1);
|
||||
for (regex_tai_t i = 0; i < sifter.size(); i++) {
|
||||
auto& act = sifter[i].pos;
|
||||
regex_tai_t first_on_s2 = S2_to_sifter.size();
|
||||
S2_to_sifter.push_back(i);
|
||||
S1_to_S2[act.first] = first_on_s2;
|
||||
if (act.type != tracking_var_types::dot_cur_pos) {
|
||||
S3_to_sifter.push_back(i);
|
||||
}
|
||||
if (act.type == tracking_var_types::range) {
|
||||
regex_tai_t second_on_s2 = S2_to_sifter.size();
|
||||
S2_to_sifter.push_back(i);
|
||||
S1_to_S2[act.second] = second_on_s2;
|
||||
}
|
||||
}
|
||||
SN2 = S2_to_sifter.size();
|
||||
SN3 = S3_to_sifter.size();
|
||||
assert(SN3 <= SN2 && SN2 <= SN1 && SN1 <= UINT16_MAX);
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
std::vector<uint64_t> compress_compressed_selarr(const std::vector<uint64_t>& S2,
|
||||
const SelarrCompressionScheme& cmp) {
|
||||
std::vector<uint64_t> S3(cmp.SN3);
|
||||
for (size_t i = 0; i < cmp.SN3; i++) {
|
||||
const RegexPriorityTableAction_Pos& act = cmp.sifter[cmp.S3_to_sifter[i]].pos;
|
||||
if (act.type == tracking_var_types::dot_immediate) {
|
||||
S3[i] = S2[cmp.S1_to_S2[act.first]];
|
||||
} else {
|
||||
assert(act.type == tracking_var_types::range); // It must be range type
|
||||
uint64_t onFirstBorder = S2[cmp.S1_to_S2[act.first]];
|
||||
uint64_t onSecondBorder = S2[cmp.S1_to_S2[act.second]];
|
||||
S3[i] = (onFirstBorder > onSecondBorder) ? 1 : 0;
|
||||
}
|
||||
}
|
||||
return S3;
|
||||
}
|
||||
|
||||
bool compressed_selarr_A_outranks_B(const std::vector<uint64_t>& A, const std::vector<uint64_t>& B,
|
||||
const SelarrCompressionScheme& cmp) {
|
||||
for (const RegexPriorityTableAction& act: cmp.sifter) {
|
||||
uint64_t valA = A[cmp.S1_to_S2[act.pos.first]];
|
||||
uint64_t valB = B[cmp.S1_to_S2[act.pos.first]];
|
||||
if (act.pos.type == tracking_var_types::range) {
|
||||
uint64_t valAsecond = A[cmp.S1_to_S2[act.pos.second]];
|
||||
uint64_t valBsecond = A[cmp.S1_to_S2[act.pos.second]];
|
||||
valA = valAsecond > valA ? valAsecond - valA : 0;
|
||||
valB = valBsecond > valB ? valBsecond - valB : 0;
|
||||
}
|
||||
if (valA == valB)
|
||||
continue;
|
||||
return (valA < valB) == act.minimize;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Beacuse of the way wash_history_bush builds this structure, root is te last node.
|
||||
* rankdir is from left to right (guaranteed). Can be empty if original history contained no raisin */
|
||||
struct RaisinBush {
|
||||
std::vector<CleanOperHistoryNode> clean_history;
|
||||
ssize_t start = -1;
|
||||
|
||||
bool empty() const {
|
||||
return start < 0;
|
||||
}
|
||||
|
||||
#ifdef PR_DEB
|
||||
void print() {
|
||||
lines text;
|
||||
text.push_back("Raisin bush");
|
||||
if (start >= 0) {
|
||||
size_t n = clean_history.size();
|
||||
std::vector<bool> m(n, false);
|
||||
TreeWithStringsNode e{""};
|
||||
std::function<void(TreeWithStringsNode&, size_t)> dfs = [&]
|
||||
(TreeWithStringsNode& fill, size_t nodeId)
|
||||
{
|
||||
if (m[nodeId]) {
|
||||
fill.val = "PARADOX";
|
||||
return;
|
||||
}
|
||||
m[nodeId] = true;
|
||||
const CleanOperHistoryNode& node = clean_history[nodeId];
|
||||
fill.val = "[" + std::to_string(nodeId) + "]";
|
||||
if (!node.exit.empty())
|
||||
fill.val += (" EXIT: " + node.exit.toString());
|
||||
size_t CN = node.next.size();
|
||||
fill.childeren.resize(CN);
|
||||
for (size_t i = 0; i < CN; i++) {
|
||||
fill.childeren[i].val = node.next[i].op.toString();
|
||||
fill.childeren[i].childeren = {{}};
|
||||
dfs(fill.childeren[i].childeren[0], node.next[i].u);
|
||||
}
|
||||
};
|
||||
dfs(e, start);
|
||||
size_t am = 0;
|
||||
for (bool el: m)
|
||||
am += static_cast<size_t>(el);
|
||||
if (am < n)
|
||||
text[0] += ": " + std::to_string(n - am) + " nodes are unreachable by detour";
|
||||
e.toLines(text);
|
||||
} else {
|
||||
if (clean_history.empty())
|
||||
text[0] = "Empty Raisin Bush";
|
||||
else
|
||||
text [0] = "Raisin bush with no root and " + std::to_string(clean_history.size()) = " nodes missed";
|
||||
}
|
||||
printLines(wrapWithBox(text));
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
void wash_history_bush(const std::vector<OperHistoryNode>& history, RaisinBush& answer,
|
||||
const SelarrCompressionScheme& cmp) {
|
||||
assert(!history.empty());
|
||||
std::vector<bool> has_raisin(history.size());
|
||||
std::vector<ssize_t> dirty_to_clean(history.size(), -1);
|
||||
std::vector<std::pair<size_t, size_t> > callStack = {{0, 0}};
|
||||
|
||||
auto hist_clean_detour_init_clean = [&](uint64_t v) -> uint64_t {
|
||||
if (!has_raisin[v]) {
|
||||
has_raisin[v] = true;
|
||||
dirty_to_clean[v] = answer.clean_history.size();
|
||||
answer.clean_history.emplace_back();
|
||||
}
|
||||
return dirty_to_clean[v];
|
||||
};
|
||||
|
||||
while (!callStack.empty()) {
|
||||
size_t v = callStack.back().first;
|
||||
size_t od = callStack.back().second;
|
||||
if (od == 0) {
|
||||
if (!history[v].raisin.empty()) {
|
||||
size_t cleanVId = hist_clean_detour_init_clean(v);
|
||||
std::vector<uint64_t>& sr = answer.clean_history[cleanVId].exit.sorted_raisin;
|
||||
sr = history[v].raisin;
|
||||
std::sort(sr.begin(), sr.end());
|
||||
answer.clean_history[cleanVId].exit.double_compressed_selarr = compress_compressed_selarr(history[v].compressed_selarr, cmp);
|
||||
}
|
||||
} else {
|
||||
const OperHistoryNodeTransition& old_hist_tr = history[v].next[od - 1];
|
||||
uint64_t ou = old_hist_tr.u;
|
||||
if (has_raisin[ou]) {
|
||||
size_t cleanVId = hist_clean_detour_init_clean(v);
|
||||
answer.clean_history[cleanVId].next.emplace_back(old_hist_tr.op, dirty_to_clean[ou]);
|
||||
}
|
||||
}
|
||||
|
||||
if (od == history[v].next.size()) {
|
||||
callStack.pop_back();
|
||||
} else {
|
||||
callStack.back().second++;
|
||||
callStack.emplace_back(history[v].next[od].u, 0);
|
||||
}
|
||||
}
|
||||
|
||||
if (has_raisin[0]) {
|
||||
assert(dirty_to_clean[0] >= 0);
|
||||
answer.start = dirty_to_clean[0];
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/* If is_it_after_read is false, unknown selarr range variable border and cur pos are evaluated to 0.
|
||||
* Otherwise, cur pos considered to be greater than previous values of selarr ange variable boundaries */
|
||||
void building_detour(const SelarrCompressionScheme& cmp,
|
||||
const std::vector<uint64_t>& outer_selarr, const std::vector<FA_Node*>& zeroeps, const codeset_t& I,
|
||||
RaisinBush& answer, bool is_it_after_read)
|
||||
{
|
||||
#ifdef PR_DEB
|
||||
printf("Det Debug: build_detour started with zeroeps:{");
|
||||
for (FA_Node* node: zeroeps)
|
||||
printf("%lu,", node->nodeId);
|
||||
printf("}, I: {%s}\n", stringifyCodesetBase10(I).c_str());
|
||||
#endif
|
||||
assert(cmp.SN3 == outer_selarr.size());
|
||||
if (!is_it_after_read)
|
||||
for (uint64_t val: outer_selarr)
|
||||
assert(val == 0);
|
||||
|
||||
struct SearchMark {
|
||||
FA_Node* domain_node;
|
||||
uint64_t epsilon_refs = 0;
|
||||
uint64_t detour_sat = 0;
|
||||
/* id of corresponding history node */
|
||||
size_t Hv = 0;
|
||||
|
||||
explicit SearchMark(FA_Node *domain_node) : domain_node(domain_node) {}
|
||||
};
|
||||
|
||||
/* Default values are good for me */
|
||||
std::vector<SearchMark> marks;
|
||||
for (size_t i = 0; i < zeroeps.size(); i++) {
|
||||
marks.emplace_back(zeroeps[i]);
|
||||
zeroeps[i]->search_mark = i;
|
||||
}
|
||||
|
||||
auto lob_allows_to_pass = [&](FA_NodeOfLookOneBehind* lob) -> bool {
|
||||
if (!intersect_sets(lob->filter, I).empty()) {
|
||||
assert(merge_sets(lob->filter, I) == lob->filter);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
{ /* First i need to know exacly how many of MINE epsilon transitions are referencing each NODE */
|
||||
std::vector<FA_Node*> domain_detour = zeroeps;
|
||||
while (!domain_detour.empty()) {
|
||||
FA_Node* v = domain_detour.back(); domain_detour.pop_back();
|
||||
if (v->type == look_one_behind && !lob_allows_to_pass(dynamic_cast<FA_NodeOfLookOneBehind*>(v)))
|
||||
continue;
|
||||
for (FA_Node** uPtr: v->get_all_empty_valid_transitions()) {
|
||||
assert(*uPtr);
|
||||
int64_t &rds = (**uPtr).search_mark;
|
||||
if (rds == -1) {
|
||||
rds = marks.size();
|
||||
domain_detour.push_back(*uPtr);
|
||||
marks.emplace_back(*uPtr);
|
||||
}
|
||||
marks[rds].epsilon_refs++;
|
||||
}
|
||||
}
|
||||
}
|
||||
std::vector<OperHistoryNode> history = {OperHistoryNode()};
|
||||
history[0].compressed_selarr.assign(cmp.SN2, 0);
|
||||
for (size_t i = 0; i < cmp.SN3; i++) {
|
||||
const RegexPriorityTableAction_Pos& act = cmp.sifter[cmp.S3_to_sifter[i]].pos;
|
||||
if (act.type == tracking_var_types::range) {
|
||||
if (outer_selarr[i]) {
|
||||
history[0].compressed_selarr[cmp.S1_to_S2[act.second]] = 1;
|
||||
}
|
||||
} else {
|
||||
assert(act.type == tracking_var_types::dot_immediate);
|
||||
history[0].compressed_selarr[cmp.S1_to_S2[act.first]] = outer_selarr[i];
|
||||
}
|
||||
}
|
||||
/* As a result, dot_cur_pos variables will be initialized as zero (always) */
|
||||
|
||||
/* In my second detour, I will pass each vertex here only one time: after hitting the total epsilon refcount */
|
||||
std::vector<FA_Node*> can_process = zeroeps;
|
||||
/*
|
||||
auto increase_sat_refcount = [&](SearchMark& mark) {
|
||||
mark.detour_sat++;
|
||||
if (mark.detour_sat == mark.epsilon_refs && mark.ever_walked_in) {
|
||||
can_process.push_back(mark.domain_node);
|
||||
}
|
||||
};
|
||||
*/
|
||||
|
||||
auto add_history_update = [&](TrackingOperationInFa how, uint64_t where, uint64_t from_where) {
|
||||
history[from_where].next.emplace_back(how, where);
|
||||
};
|
||||
|
||||
while (!can_process.empty()) {
|
||||
FA_Node* v = can_process.back(); can_process.pop_back();
|
||||
SearchMark& Vmark = marks[v->search_mark];
|
||||
assert(Vmark.detour_sat == Vmark.epsilon_refs);
|
||||
uint64_t Hv = Vmark.Hv;
|
||||
uint64_t Hop = Hv;
|
||||
if (v->type == look_one_behind) {
|
||||
FA_NodeOfLookOneBehind* tv = dynamic_cast<FA_NodeOfLookOneBehind*>(v);
|
||||
if (!lob_allows_to_pass(tv))
|
||||
continue;
|
||||
} else if (isTrackingFaNode(v)) {
|
||||
Hop = history.size();
|
||||
history.emplace_back();
|
||||
std::vector<uint64_t>& val2 = history.back().compressed_selarr;
|
||||
val2 = history[Hv].compressed_selarr;
|
||||
if (v->type == track_array_mov_imm) {
|
||||
FA_NodeOfTrackArrayMovImm* tv = dynamic_cast<FA_NodeOfTrackArrayMovImm*>(v);
|
||||
if (isSelarrOpcode(tv->operation)) {
|
||||
int key_s2 = cmp.S1_to_S2[tv->key];
|
||||
if (key_s2 >= 0){
|
||||
assert(cmp.sifter[cmp.S2_to_sifter[key_s2]].pos.type == tracking_var_types::dot_immediate);
|
||||
val2[key_s2] = tv->imm_value;
|
||||
}
|
||||
}
|
||||
add_history_update(TrackingOperationInFa(tv->operation, tv->key, tv->imm_value), Hop, Hv);
|
||||
} else if (v->type == track_array_mov_halfinvariant) {
|
||||
FA_NodeOfTrackArrayMovHalfinvariant* tv = dynamic_cast<FA_NodeOfTrackArrayMovHalfinvariant*>(v);
|
||||
if (isSelarrOpcode(tv->operation)) {
|
||||
int key_s2 = cmp.S1_to_S2[tv->key];
|
||||
if (key_s2 >= 0){
|
||||
const RegexPriorityTableAction_Pos& act = cmp.sifter[cmp.S2_to_sifter[key_s2]].pos;
|
||||
assert(act.type != tracking_var_types::dot_immediate);
|
||||
if (act.type == tracking_var_types::dot_cur_pos) {
|
||||
val2[key_s2] = is_it_after_read ? 1 : 0;
|
||||
} else {
|
||||
val2[key_s2] = is_it_after_read ? 2 : 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
add_history_update(TrackingOperationInFa(tv->operation, tv->key), Hop, Hv);
|
||||
}
|
||||
} else if (v->type == match || v->type == one_char_read) {
|
||||
// Determinization stop
|
||||
history[Hv].raisin.push_back(v->nodeId);
|
||||
}
|
||||
for (FA_Node** uPtr: v->get_all_empty_valid_transitions()) {
|
||||
assert(*uPtr);
|
||||
SearchMark& Umark = marks[(**uPtr).search_mark];
|
||||
/* Here I use Hop to determine Hv value of u */
|
||||
if (Umark.detour_sat == 0) {
|
||||
Umark.Hv = Hop;
|
||||
} else if (Umark.Hv != Hop) {
|
||||
if (compressed_selarr_A_outranks_B(
|
||||
history[Hop].compressed_selarr, history[Umark.Hv].compressed_selarr, cmp)){
|
||||
Umark.Hv = Hop;
|
||||
}
|
||||
}
|
||||
/* Collision calculation finished */
|
||||
Umark.detour_sat++;
|
||||
if (Umark.detour_sat == Umark.epsilon_refs) {
|
||||
can_process.push_back(Umark.domain_node);
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Cleaning this mess */
|
||||
for (auto& m: marks) {
|
||||
m.domain_node->search_mark = -1;
|
||||
}
|
||||
/* Packaging the answer (we do a little bit of dfs here) */
|
||||
wash_history_bush(history, answer, cmp);
|
||||
}
|
||||
|
||||
void update_had_to_fork_status(const RaisinBush& bush, int& had_to_fork) {
|
||||
for (const CleanOperHistoryNode& node: bush.clean_history) {
|
||||
if (node.next.size() > 1 || (!node.next.empty() && !node.exit.empty())) {
|
||||
had_to_fork = 1;
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
typedef size_t superstate_id_t;
|
||||
|
||||
typedef std::vector<std::pair<FA_Node**, superstate_id_t>> homework_t;
|
||||
|
||||
struct LessSuperState {
|
||||
bool operator()(const SuperState& A, const SuperState& B) const {
|
||||
std::less<std::vector<uint64_t>> f1L;
|
||||
if (f1L(A.sorted_raisin, B.sorted_raisin))
|
||||
return true;
|
||||
if (f1L(B.sorted_raisin, A.sorted_raisin))
|
||||
return false;
|
||||
return f1L(A.double_compressed_selarr, B.double_compressed_selarr);
|
||||
}
|
||||
};
|
||||
|
||||
struct GlobalDetourProgress {
|
||||
std::map<SuperState, superstate_id_t, LessSuperState> superstates;
|
||||
/* Each element is a root of some megabush in resFa */
|
||||
std::vector<FA_Node*> superstate_megabush_constructed;
|
||||
std::vector<SuperState> todo_superstaes;
|
||||
};
|
||||
|
||||
/* If x was not previously achieved, it will also add it to t o d o list of global detour */
|
||||
superstate_id_t convertSuperstateToId(const SuperState& x, GlobalDetourProgress& gdp) {
|
||||
if (gdp.superstates.count(x)) {
|
||||
return gdp.superstates[x];
|
||||
}
|
||||
size_t n = gdp.superstates.size();
|
||||
gdp.superstates.insert({x, n});
|
||||
gdp.todo_superstaes.push_back(x);
|
||||
gdp.superstate_megabush_constructed.push_back(NULL);
|
||||
return n;
|
||||
}
|
||||
|
||||
/* A dead end is a forking node that is left without any options */
FA_Node* build_dead_end(FA_Container& resFa) {
    FA_Node* optionless_fork = resFa.makeForking();
    return optionless_fork;
}
|
||||
|
||||
void build_bush(const RaisinBush& alpha, FA_Node** sowing_location, FA_Container& resFa,
|
||||
homework_t& homework, GlobalDetourProgress& gdp) {
|
||||
size_t n = alpha.clean_history.size();
|
||||
if (n == 0) {
|
||||
FA_Node* dead_end = build_dead_end(resFa);
|
||||
reattach_fa_node_edge(sowing_location, dead_end);
|
||||
return;
|
||||
}
|
||||
std::vector<std::pair<FA_Node**, size_t>> todo = {{sowing_location, alpha.start}};
|
||||
|
||||
while (!todo.empty()) {
|
||||
FA_Node** sl = todo.back().first;
|
||||
const CleanOperHistoryNode& hnode = alpha.clean_history[todo.back().second];
|
||||
todo.pop_back();
|
||||
auto history_transition = [&](size_t i, FA_Node** of_sl) {
|
||||
FA_NodePathPart* pn = convert_to_node(hnode.next[i].op, resFa);
|
||||
reattach_fa_node_edge(of_sl, pn);
|
||||
todo.emplace_back(&(pn->nxt_node), hnode.next[i].u);
|
||||
};
|
||||
|
||||
if (hnode.next.empty()) {
|
||||
assert(!hnode.exit.empty());
|
||||
superstate_id_t w = convertSuperstateToId(hnode.exit, gdp);
|
||||
homework.emplace_back(sl, w);
|
||||
} else if (hnode.next.size() == 1 && hnode.exit.empty()) {
|
||||
history_transition(0, sl);
|
||||
} else {
|
||||
FA_NodeOfForking* forker = resFa.makeForking();
|
||||
bool raisin = !hnode.exit.empty();
|
||||
size_t k = hnode.next.size();
|
||||
forker->nxt_options.assign(k + static_cast<size_t>(raisin), NULL);
|
||||
for (size_t i = 0; i < k; i++) {
|
||||
history_transition(i, &(forker->nxt_options[i]));
|
||||
}
|
||||
if (raisin) {
|
||||
superstate_id_t w = convertSuperstateToId(hnode.exit, gdp);
|
||||
homework.emplace_back(&(forker->nxt_options[k]), w);
|
||||
}
|
||||
reattach_fa_node_edge(sl, forker);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Collects the distinct filters of all look_one_behind nodes of `sourceFa` and returns
 * a ColoredCodeset (constructed with their count) to which every one of them was applied as a divisor. */
ColoredCodeset get_pretreated_cc(FA_Container& sourceFa) {
    std::set<codeset_t> lob_filters;
    for (FA_Node* node: sourceFa.all) {
        if (node->type != look_one_behind)
            continue;
        lob_filters.insert(static_cast<FA_NodeOfLookOneBehind*>(node)->filter);
    }

    ColoredCodeset divided(lob_filters.size());
    for (const codeset_t& filter: lob_filters)
        divided.apply_divisor(filter);
    return divided;
}
|
||||
|
||||
// todo add a check on size of dfa
|
||||
void try_determinize_fa(FA_Container &sourceFa, const RegexPriorityTable &sifter, regex_tai_t selarr_sz,
|
||||
const REGEX_IS024_FA_FirstStageFixInfo &info1, FA_Container &resFa, int &error, int& had_to_fork)
|
||||
{
|
||||
/* During execuion, i will create pointers to field res.start and store them (inside the scope of this function)
|
||||
* Luckily res argument is already immovable in this scope. */
|
||||
error = 0;
|
||||
had_to_fork = 0;
|
||||
assert(resFa.start == NULL && resFa.all.empty());
|
||||
input_fa_assert(sourceFa);
|
||||
SelarrCompressionScheme cmp(selarr_sz, sifter);
|
||||
|
||||
GlobalDetourProgress gdp;
|
||||
homework_t homework;
|
||||
|
||||
ColoredCodeset pretreated_cc = get_pretreated_cc(sourceFa);
|
||||
|
||||
FA_Node** res_start_ptr = &(resFa.start);
|
||||
if (info1.fed_chars_extend_one_left) {
|
||||
ColoredCodeset inp_distinction = pretreated_cc;
|
||||
inp_distinction.apply_divisor(codeset_of_all);
|
||||
std::vector<codeset_t> starting_Is;
|
||||
std::vector<std::vector<size_t>> starting_Cids; /* Filler variable */
|
||||
inp_distinction.get_splits_of_non_dummy(starting_Is, starting_Cids);
|
||||
size_t R = starting_Is.size();
|
||||
for (auto& rdh: starting_Cids) {
|
||||
assert(rdh.size() == 1 && rdh[0] == 0);
|
||||
}
|
||||
FA_NodeOfDetCharCrossroads* very_first_cr = resFa.makeDetCharCrossroads();
|
||||
very_first_cr->second_ns = true;
|
||||
reattach_fa_node_edge(res_start_ptr, very_first_cr);
|
||||
very_first_cr->crossroads.resize(R); /* After that, nobody has right to resize crossroads array */
|
||||
for (size_t i = 0; i < R; i++) {
|
||||
very_first_cr->crossroads[i].input = starting_Is[i];
|
||||
FA_Node** sowing_place = &(very_first_cr->crossroads[i].nxt_node);
|
||||
RaisinBush alpha;
|
||||
building_detour(cmp, std::vector<uint64_t>(cmp.SN3, 0), {sourceFa.start}, starting_Is[i], alpha, false);
|
||||
#ifdef PR_DEB
|
||||
printf("Initialization hard %ld/%ld\n", i + 1, R);
|
||||
alpha.print();
|
||||
#endif
|
||||
update_had_to_fork_status(alpha, had_to_fork);
|
||||
build_bush(alpha, sowing_place, resFa, homework, gdp);
|
||||
}
|
||||
} else {
|
||||
RaisinBush alpha;
|
||||
building_detour(cmp, std::vector<uint64_t>(cmp.SN3, 0), {sourceFa.start}, codeset_of_all, alpha, false);
|
||||
#ifdef PR_DEB
|
||||
printf("Initialization easy\n");
|
||||
alpha.print();
|
||||
#endif
|
||||
update_had_to_fork_status(alpha, had_to_fork);
|
||||
build_bush(alpha, res_start_ptr, resFa, homework, gdp);
|
||||
}
|
||||
/* Now we start the actual detour. */
|
||||
while (!gdp.todo_superstaes.empty()) {
|
||||
SuperState SS = gdp.todo_superstaes.back(); gdp.todo_superstaes.pop_back();
|
||||
// printf("Global detour turn: %s\n", SS.toString().c_str());
|
||||
std::vector<FA_NodeOfOneCharRead*> reading_stops;
|
||||
codeset_t how_can_i_finish = {};
|
||||
for (size_t v: SS.sorted_raisin) {
|
||||
FA_Node* node = sourceFa.all[v];
|
||||
if (node->type == one_char_read) {
|
||||
reading_stops.push_back(static_cast<FA_NodeOfOneCharRead*>(node));
|
||||
} else if (node->type == match) {
|
||||
auto fn = static_cast<FA_NodeOfMatch*>(node);
|
||||
assert(!fn->ext_filter_added || info1.fed_chars_extend_one_right);
|
||||
if (fn->ext_filter_added) {
|
||||
how_can_i_finish = merge_sets(how_can_i_finish, fn->pending_filter);
|
||||
} else {
|
||||
how_can_i_finish = codeset_of_all;
|
||||
}
|
||||
} else
|
||||
assert(false);
|
||||
}
|
||||
// Determinization stop: one char read (input)
|
||||
ColoredCodeset inp_distinction = pretreated_cc;
|
||||
size_t pr = reading_stops.size();
|
||||
for (size_t i = 0; i < pr; i++) {
|
||||
inp_distinction.apply_divisor(reading_stops[i]->filter);
|
||||
}
|
||||
std::vector<codeset_t> Is;
|
||||
std::vector<std::vector<size_t>> Cids;
|
||||
inp_distinction.get_splits_of_non_dummy(Is, Cids);
|
||||
size_t R = Is.size();
|
||||
FA_NodeOfDetCharCrossroads* my_cr = NULL;
|
||||
if (R > 0) {
|
||||
my_cr = resFa.makeDetCharCrossroads();
|
||||
if (!info1.fed_chars_extend_one_right && !how_can_i_finish.empty()) {
|
||||
assert(how_can_i_finish == codeset_of_all);
|
||||
my_cr->matching = true;
|
||||
}
|
||||
my_cr->crossroads.resize(R);
|
||||
}
|
||||
for (size_t i = 0; i < R; i++) {
|
||||
my_cr->crossroads[i].input = Is[i];
|
||||
my_cr->crossroads[i].nxt_node = NULL;
|
||||
std::vector<FA_Node*> fl_passed_filters;
|
||||
for (size_t j: Cids[i]) {
|
||||
fl_passed_filters.push_back(reading_stops[j]->nxt_node);
|
||||
}
|
||||
// todo: make a function out of next 6 lines of code
|
||||
RaisinBush alpha;
|
||||
building_detour(cmp, SS.double_compressed_selarr, fl_passed_filters, Is[i], alpha, true);
|
||||
#ifdef PR_DEB
|
||||
printf("That same turn, subbush %ld/%ld\n", i + 1, R);
|
||||
alpha.print();
|
||||
#endif
|
||||
update_had_to_fork_status(alpha, had_to_fork);
|
||||
build_bush(alpha, &(my_cr->crossroads[i].nxt_node), resFa, homework, gdp);
|
||||
}
|
||||
// Determinization stop: match (finish)
|
||||
FA_Node* finish_route = NULL;
|
||||
if (!how_can_i_finish.empty() && (info1.fed_chars_extend_one_right || R == 0)) {
|
||||
FA_NodeOfMatch* matcher = resFa.makeMatch();
|
||||
finish_route = matcher;
|
||||
if (info1.fed_chars_extend_one_right) {
|
||||
FA_NodeOfOneCharRead* right_ext_read = resFa.makeOneCharRead(how_can_i_finish, true);
|
||||
reattach_nxt_node(right_ext_read, matcher);
|
||||
finish_route = right_ext_read;
|
||||
}
|
||||
}
|
||||
// Combining these two cases
|
||||
assert(finish_route || my_cr);
|
||||
FA_Node*& endsUp = gdp.superstate_megabush_constructed[gdp.superstates[SS]];
|
||||
if (!finish_route) {
|
||||
endsUp = my_cr;
|
||||
} else if (!my_cr) {
|
||||
endsUp = finish_route;
|
||||
} else {
|
||||
FA_NodeOfForking* F = resFa.makeForking();
|
||||
F->nxt_options = {NULL, NULL};
|
||||
reattach_fa_node_edge(&(F->nxt_options[0]), my_cr);
|
||||
reattach_fa_node_edge(&(F->nxt_options[1]), finish_route);
|
||||
endsUp = F;
|
||||
}
|
||||
}
|
||||
/* Now it's time to do the homework: link all megabushes */
|
||||
for (auto& p: homework) {
|
||||
reattach_fa_node_edge(p.first, gdp.superstate_megabush_constructed[p.second]);
|
||||
}
|
||||
}
|
||||
|
10
src/libregexis024fa/fa_make_deterministic.h
Normal file
10
src/libregexis024fa/fa_make_deterministic.h
Normal file
@ -0,0 +1,10 @@
|
||||
#ifndef LIBREGEXIS024_FA_MAKE_DETERMINISTIC_H
|
||||
#define LIBREGEXIS024_FA_MAKE_DETERMINISTIC_H
|
||||
|
||||
#include <libregexis024fa/fa_first_stage_fix.h>
|
||||
#include <libregexis024fa/selarr_priority_table.h>
|
||||
|
||||
void try_determinize_fa(FA_Container &sourceFa, const RegexPriorityTable &sifter, regex_tai_t selarr_sz,
|
||||
const REGEX_IS024_FA_FirstStageFixInfo &info1, FA_Container &resFa, int &error, int& had_to_fork);
|
||||
|
||||
#endif //LIBREGEXIS024_FA_MAKE_DETERMINISTIC_H
|
141
src/libregexis024fa/finite_automaton.cpp
Normal file
141
src/libregexis024fa/finite_automaton.cpp
Normal file
@ -0,0 +1,141 @@
|
||||
#include <libregexis024fa/finite_automaton.h>
|
||||
#include <libregexis024vm/utils.h>
|
||||
#include <assert.h>
|
||||
|
||||
bool FA_Node::empty() {
|
||||
return type != one_char_read && type != det_char_crossroads;
|
||||
}
|
||||
|
||||
/* Default behaviour: the restriction is ignored. Node kinds that care override this. */
void FA_Node::apply_lookahead_restriction(const codeset_t &restriction) {
    /* intentionally empty */
}
|
||||
|
||||
void FA_Node::reAdd_references() {
|
||||
for (FA_Node** nxtPtr: get_all_transitions()){
|
||||
if (*nxtPtr)
|
||||
(**nxtPtr).refs++;
|
||||
}
|
||||
}
|
||||
|
||||
/* Base node has no outgoing edges at all */
std::vector<FA_Node **> FA_Node::get_all_transitions() {
    return std::vector<FA_Node **>();
}
|
||||
|
||||
/* Base node has no outgoing edges, hence no empty ones either */
std::vector<FA_Node **> FA_Node::get_all_empty_valid_transitions() {
    return std::vector<FA_Node **>();
}
|
||||
|
||||
std::vector<FA_Node **> FA_NodePathPart::get_all_transitions() {
|
||||
return {&nxt_node};
|
||||
}
|
||||
|
||||
std::vector<FA_Node **> FA_NodePathPart::get_all_empty_valid_transitions() {
|
||||
if (nxt_node)
|
||||
return {&nxt_node};
|
||||
return {};
|
||||
}
|
||||
|
||||
/* Tags the node as `match` */
FA_NodeOfMatch::FA_NodeOfMatch() {
    type = match;
}
|
||||
|
||||
/* Remembers the restriction as the pending filter of this match node.
 * NOTE(review): a second call overwrites the previous filter, while FA_NodeOfOneCharRead intersects
 * with the existing one - confirm this difference is intended. */
void FA_NodeOfMatch::apply_lookahead_restriction(const codeset_t &restriction) {
    pending_filter = restriction;
    ext_filter_added = true;
}
|
||||
|
||||
/* Stores the accepted codeset and the namespace flag; tags the node as `one_char_read` */
FA_NodeOfOneCharRead::FA_NodeOfOneCharRead(const codeset_t &filter, bool second_namespace)
    : filter(filter), second_ns(second_namespace)
{
    type = one_char_read;
}
|
||||
|
||||
void FA_NodeOfOneCharRead::apply_lookahead_restriction(const codeset_t &restriction) {
|
||||
filter = intersect_sets(filter, restriction);
|
||||
}
|
||||
|
||||
/* The edge out of a reading node consumes a character, so it is never an empty transition */
std::vector<FA_Node **> FA_NodeOfOneCharRead::get_all_empty_valid_transitions() {
    return std::vector<FA_Node **>();
}
|
||||
|
||||
/* Tags the node as `forking`; the option list starts empty */
FA_NodeOfForking::FA_NodeOfForking() {
    type = forking;
}
|
||||
|
||||
std::vector<FA_Node **> FA_NodeOfForking::get_all_empty_valid_transitions() {
|
||||
std::vector<FA_Node**> res;
|
||||
for (size_t i = 0; i < nxt_options.size(); i++)
|
||||
if (nxt_options[i])
|
||||
res.push_back(&nxt_options[i]);
|
||||
return res;
|
||||
}
|
||||
|
||||
std::vector<FA_Node **> FA_NodeOfForking::get_all_transitions() {
|
||||
std::vector<FA_Node**> res;
|
||||
for (size_t i = 0; i < nxt_options.size(); i++)
|
||||
res.push_back(&nxt_options[i]);
|
||||
return res;
|
||||
}
|
||||
|
||||
/* Stores the filter; tags the node as `look_one_behind` */
FA_NodeOfLookOneBehind::FA_NodeOfLookOneBehind(const codeset_t &filter)
    : filter(filter)
{
    type = look_one_behind;
}
|
||||
|
||||
/* Stores the restriction; tags the node as `look_one_ahead` */
FA_NodeOfLookOneAhead::FA_NodeOfLookOneAhead(const codeset_t &restriction)
    : restriction(restriction)
{
    type = look_one_ahead;
}
|
||||
|
||||
/* Stores opcode, key and immediate value; tags the node as `track_array_mov_imm` */
FA_NodeOfTrackArrayMovImm::FA_NodeOfTrackArrayMovImm(regex024_opcode operation, uint16_t key, uint64_t immValue)
    : operation(operation), key(key), imm_value(immValue)
{
    type = track_array_mov_imm;
}
|
||||
//
|
||||
|
||||
/* Stores opcode and key; tags the node as `track_array_mov_halfinvariant` */
FA_NodeOfTrackArrayMovHalfinvariant::FA_NodeOfTrackArrayMovHalfinvariant(regex024_opcode operation, uint16_t key)
    : operation(operation), key(key)
{
    type = track_array_mov_halfinvariant;
}
|
||||
//
|
||||
|
||||
/* Not supported for crossroads nodes: calling this reports the message below through exitf() */
void FA_NodeOfDetCharCrossroads::apply_lookahead_restriction(const codeset_t &restriction)
{
    exitf("What?? Oh, no, no. I am NOT doing it");
}
|
||||
|
||||
/* Copies the given paths; tags the node as `det_char_crossroads` */
FA_NodeOfDetCharCrossroads::FA_NodeOfDetCharCrossroads(const std::vector<DFA_CrossroadPath> &crossroads)
    : crossroads(crossroads)
{
    type = det_char_crossroads;
}
|
||||
|
||||
/* Every path of a crossroads consumes a character, so there are no empty transitions */
std::vector<FA_Node **> FA_NodeOfDetCharCrossroads::get_all_empty_valid_transitions() {
    return std::vector<FA_Node **>();
}
|
||||
|
||||
std::vector<FA_Node **> FA_NodeOfDetCharCrossroads::get_all_transitions() {
|
||||
std::vector<FA_Node**> res;
|
||||
for (auto& tr: crossroads)
|
||||
res.push_back(&tr.nxt_node);
|
||||
return res;
|
||||
}
|
||||
|
||||
/* If transferring ownership of node to container has failed, node is freed (which means it is ivalidated)
|
||||
* If this semi-ownership transfer succeded (no std::bad_alloc), then node is still valid to use, and at the end
|
||||
* of FA_Container lifetime it is guaranteed to be deleted
|
||||
*/
|
||||
void FA_Container::registerNew(FA_Node *node) {
|
||||
try {
|
||||
node->nodeId = (int64_t)all.size();
|
||||
all.push_back(node);
|
||||
} catch (const std::bad_alloc& ba) {
|
||||
delete node;
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
/* A path: the codeset that selects it and the node it leads to */
DFA_CrossroadPath::DFA_CrossroadPath(const codeset_t &input, FA_Node *nxt_node)
    : input(input), nxt_node(nxt_node)
{
}
|
||||
//
|
||||
|
||||
/* The container owns every registered node: delete them all, in registration order */
FA_Container::~FA_Container() {
    for (size_t i = 0; i < all.size(); i++) {
        delete all[i];
    }
}
|
||||
|
||||
#define bs(name, args, params) \
|
||||
FA_NodeOf ## name *FA_Container::make ## name(args) { \
|
||||
FA_NodeOf ## name *node = new FA_NodeOf ## name(params); \
|
||||
registerNew(node); \
|
||||
return node; \
|
||||
}
|
||||
#define COMMA ,
|
||||
|
||||
bs(Match, , )
|
||||
bs(OneCharRead, const codeset_t& filter COMMA bool second_namespace, filter COMMA second_namespace)
|
||||
bs(Forking, , )
|
||||
bs(LookOneBehind, const codeset_t& filter, filter)
|
||||
bs(LookOneAhead, const codeset_t& filter, filter)
|
||||
bs(TrackArrayMovImm, regex024_opcode operation COMMA uint16_t key COMMA uint64_t immValue,
|
||||
operation COMMA key COMMA immValue)
|
||||
bs(TrackArrayMovHalfinvariant, regex024_opcode operation COMMA uint16_t key, operation COMMA key)
|
||||
bs(DetCharCrossroads, ,{})
|
149
src/libregexis024fa/finite_automaton.h
Normal file
149
src/libregexis024fa/finite_automaton.h
Normal file
@ -0,0 +1,149 @@
|
||||
#ifndef LIBREGEXIS024_FINITE_AUTOMATON_H
|
||||
#define LIBREGEXIS024_FINITE_AUTOMATON_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <vector>
|
||||
#include <libregexis024fa/codeset.h>
|
||||
#include <libregexis024vm/vm_opcodes.h>
|
||||
|
||||
/* Tag stored in FA_Node::type; tells which concrete node struct an FA_Node really is */
enum FA_Node_type: uint8_t {
    match = 0,
    one_char_read = 1,
    forking = 2,
    look_one_behind = 3,
    look_one_ahead = 4,
    track_array_mov_imm = 5,
    track_array_mov_halfinvariant = 6,
    /* Used for DFA */
    det_char_crossroads = 7,
};
|
||||
|
||||
struct FA_Node{
|
||||
size_t refs = 0;
|
||||
/* If node is not in searched subset (at least yet), `search mark == -1`, otherwise
|
||||
* it is an index (for that particular node) in the vector that captures all nodes in
|
||||
* searched subset*/
|
||||
int64_t search_mark = -1;
|
||||
FA_Node_type type;
|
||||
int64_t nodeId;
|
||||
|
||||
bool empty();
|
||||
virtual std::vector<FA_Node**> get_all_empty_valid_transitions();
|
||||
virtual void apply_lookahead_restriction(const codeset_t &restriction);
|
||||
void reAdd_references();
|
||||
virtual ~FA_Node() = default;
|
||||
virtual std::vector<FA_Node**> get_all_transitions();
|
||||
};
|
||||
|
||||
struct FA_NodePathPart: public FA_Node{
|
||||
FA_Node* nxt_node = NULL;
|
||||
|
||||
std::vector<FA_Node **> get_all_empty_valid_transitions() override;
|
||||
std::vector<FA_Node **> get_all_transitions() override;
|
||||
};
|
||||
|
||||
struct FA_NodeOfMatch: public FA_Node{
|
||||
bool ext_filter_added = false;
|
||||
codeset_t pending_filter;
|
||||
|
||||
explicit FA_NodeOfMatch();
|
||||
void apply_lookahead_restriction(const codeset_t &restriction) override;
|
||||
};
|
||||
|
||||
/* .type == one_char_read */
|
||||
struct FA_NodeOfOneCharRead: public FA_NodePathPart{
|
||||
codeset_t filter;
|
||||
bool second_ns = false;
|
||||
|
||||
FA_NodeOfOneCharRead(const codeset_t &filter, bool second_namespace);
|
||||
void apply_lookahead_restriction(const codeset_t &restriction) override;
|
||||
std::vector<FA_Node **> get_all_empty_valid_transitions() override;
|
||||
};
|
||||
|
||||
/* .type == forking */
|
||||
struct FA_NodeOfForking: public FA_Node{
|
||||
/* Won't be modified after init (in regexp compilation into NFA) */
|
||||
std::vector<FA_Node*> nxt_options;
|
||||
int64_t stopId = -1;
|
||||
|
||||
explicit FA_NodeOfForking();
|
||||
std::vector<FA_Node **> get_all_empty_valid_transitions() override;
|
||||
std::vector<FA_Node **> get_all_transitions() override;
|
||||
};
|
||||
|
||||
/* .type == look_one_behind */
|
||||
struct FA_NodeOfLookOneBehind: public FA_NodePathPart{
|
||||
/* [0; UINT32_MAX] is equivalent to no filter */
|
||||
codeset_t filter;
|
||||
|
||||
explicit FA_NodeOfLookOneBehind(const codeset_t &filter);
|
||||
};
|
||||
|
||||
/* .type == look_one_ahead */
|
||||
struct FA_NodeOfLookOneAhead: public FA_NodePathPart{
|
||||
/* [0; UINT32_MAX] is equivalent to no restriction */
|
||||
codeset_t restriction;
|
||||
|
||||
explicit FA_NodeOfLookOneAhead(const codeset_t &restriction);
|
||||
};
|
||||
|
||||
/* .type == track_array_mov_imm */
|
||||
struct FA_NodeOfTrackArrayMovImm: public FA_NodePathPart{
|
||||
regex024_opcode operation;
|
||||
uint16_t key;
|
||||
uint64_t imm_value;
|
||||
|
||||
FA_NodeOfTrackArrayMovImm(regex024_opcode operation, uint16_t key, uint64_t immValue);
|
||||
};
|
||||
|
||||
/* .type == track_array_mov_halfinvariant */
|
||||
struct FA_NodeOfTrackArrayMovHalfinvariant: public FA_NodePathPart{
|
||||
regex024_opcode operation;
|
||||
uint16_t key;
|
||||
|
||||
FA_NodeOfTrackArrayMovHalfinvariant(regex024_opcode operation, uint16_t key);
|
||||
};
|
||||
|
||||
struct DFA_CrossroadPath{
|
||||
codeset_t input;
|
||||
FA_Node* nxt_node = NULL;
|
||||
|
||||
DFA_CrossroadPath(const codeset_t &input, FA_Node *nxt_node);
|
||||
DFA_CrossroadPath() = default;
|
||||
};
|
||||
|
||||
/* .type == det_char_crossroads */
|
||||
struct FA_NodeOfDetCharCrossroads: public FA_Node{
|
||||
std::vector<DFA_CrossroadPath> crossroads;
|
||||
bool matching = false;
|
||||
bool second_ns = false;
|
||||
|
||||
explicit FA_NodeOfDetCharCrossroads(const std::vector<DFA_CrossroadPath> &crossroads);
|
||||
void apply_lookahead_restriction(const codeset_t &restriction) override;
|
||||
std::vector<FA_Node **> get_all_empty_valid_transitions() override;
|
||||
std::vector<FA_Node **> get_all_transitions() override;
|
||||
};
|
||||
|
||||
struct FA_Container{
|
||||
FA_Container(const FA_Container&) = delete;
|
||||
FA_Container& operator=(const FA_Container&) = delete;
|
||||
FA_Container() = default;
|
||||
|
||||
std::vector<FA_Node*> all;
|
||||
FA_Node* start = NULL;
|
||||
|
||||
void registerNew(FA_Node* node);
|
||||
|
||||
FA_NodeOfMatch* makeMatch();
|
||||
FA_NodeOfOneCharRead* makeOneCharRead(const codeset_t& filter, bool second_namespace);
|
||||
FA_NodeOfForking* makeForking();
|
||||
FA_NodeOfLookOneBehind* makeLookOneBehind(const codeset_t& filter);
|
||||
FA_NodeOfLookOneAhead* makeLookOneAhead(const codeset_t& filter);
|
||||
FA_NodeOfTrackArrayMovImm* makeTrackArrayMovImm(regex024_opcode operation, uint16_t key, uint64_t immValue);
|
||||
FA_NodeOfTrackArrayMovHalfinvariant* makeTrackArrayMovHalfinvariant(regex024_opcode operation, uint16_t key);
|
||||
FA_NodeOfDetCharCrossroads* makeDetCharCrossroads();
|
||||
|
||||
~FA_Container();
|
||||
};
|
||||
|
||||
#endif //LIBREGEXIS024_FINITE_AUTOMATON_H
|
117
src/libregexis024fa/graph_to_bytecode/core.cpp
Normal file
117
src/libregexis024fa/graph_to_bytecode/core.cpp
Normal file
@ -0,0 +1,117 @@
|
||||
#include <libregexis024fa/graph_to_bytecode/core.h>
|
||||
|
||||
#include <assert.h>
|
||||
#include <libregexis024fa/graph_to_bytecode/writing_commands.h>
|
||||
|
||||
#include <libregexis024fa/graph_to_bytecode/filter.h>
|
||||
|
||||
/* Soft assertion for functions that return void and have an `int error` in scope:
 * when `expr` is false, stores -1 into `error` and returns from the enclosing function.
 * Wrapped in do { } while (0) so that `nonthrowing_assert(x);` is a single statement
 * and stays correct inside if/else without braces. */
#define nonthrowing_assert(expr) do { if (!(expr)) { error = -1; return; } } while (0)
|
||||
|
||||
void compilation_core(std::vector<uint8_t>& result, FA_Container& fa, explicit_bookmarks& bookmark_manager,
|
||||
size_t& first_read_ns, size_t& second_read_ns, size_t& fork_ss_ns, int& error)
|
||||
{
|
||||
bookmark_id_t node_start_bm_offset = bookmark_manager.new_range_of_bookmarks(fa.all.size());
|
||||
std::vector<size_t> not_yet_dedicated_second_read_ns_ssids;
|
||||
first_read_ns = 0;
|
||||
second_read_ns = 0;
|
||||
fork_ss_ns = 0;
|
||||
assert(fa.start);
|
||||
std::vector<FA_Node*> todo = {fa.start};
|
||||
// std::vector<bool> promised(fa.all.size(), false);
|
||||
// promised[fa.start->nodeId] = true;
|
||||
|
||||
auto nodesBookmark = [&](FA_Node* node) -> bookmark_id_t {
|
||||
assert(node);
|
||||
return node_start_bm_offset + node->nodeId;
|
||||
};
|
||||
|
||||
auto addBranching = [&](FA_Node* node) {
|
||||
todo.push_back(node);
|
||||
};
|
||||
|
||||
auto reading_head = [&](bool is_in_second_ns) {
|
||||
if (is_in_second_ns) {
|
||||
cmd_READ_second_ns(result, not_yet_dedicated_second_read_ns_ssids);
|
||||
second_read_ns++;
|
||||
} else {
|
||||
cmd_READ_first_ns(result, first_read_ns++);
|
||||
}
|
||||
};
|
||||
|
||||
while (!todo.empty()) {
|
||||
FA_Node* node = todo.back(); todo.pop_back();
|
||||
if (bookmark_manager.has_landed(nodesBookmark(node))) {
|
||||
continue;
|
||||
}
|
||||
while (true) {
|
||||
if (bookmark_manager.has_landed(nodesBookmark(node))) {
|
||||
cmd_JUMP(result, bookmark_manager, nodesBookmark(node));
|
||||
break;
|
||||
}
|
||||
bookmark_manager.land_bookmark(result, nodesBookmark(node));
|
||||
if (node->type == match) {
|
||||
cmd_MATCH(result);
|
||||
cmd_DIE(result);
|
||||
break;
|
||||
} else if (node->type == one_char_read) {
|
||||
FA_NodeOfOneCharRead* ocr = dynamic_cast<FA_NodeOfOneCharRead*>(node);
|
||||
nonthrowing_assert(first_read_ns + second_read_ns < UINT32_MAX);
|
||||
reading_head(ocr->second_ns);
|
||||
write_filter(result, bookmark_manager, {ocr->filter},{nodesBookmark(ocr->nxt_node)});
|
||||
node = ocr->nxt_node;
|
||||
} else if (node->type == look_one_behind) {
|
||||
FA_NodeOfLookOneBehind* lob = dynamic_cast<FA_NodeOfLookOneBehind*>(node);
|
||||
write_filter(result, bookmark_manager, {lob->filter}, {nodesBookmark(lob->nxt_node)});
|
||||
node = lob->nxt_node;
|
||||
} else if (node->type == forking) {
|
||||
FA_NodeOfForking* fn = dynamic_cast<FA_NodeOfForking*>(node);
|
||||
std::vector<FA_Node*>& nxt_options = fn->nxt_options;
|
||||
if (nxt_options.empty()) {
|
||||
cmd_DIE(result);
|
||||
break;
|
||||
}
|
||||
if (nxt_options.size() >= 2) {
|
||||
nonthrowing_assert(fork_ss_ns < UINT32_MAX);
|
||||
regex_sslot_id_t sslot = fork_ss_ns++;
|
||||
for (size_t i = 0; i + 1 < nxt_options.size(); i++) {
|
||||
cmd_FORK(result, bookmark_manager, sslot, nodesBookmark(nxt_options[i]));
|
||||
addBranching(nxt_options[i]);
|
||||
}
|
||||
}
|
||||
node = nxt_options.back();
|
||||
} else if (node->type == track_array_mov_imm) {
|
||||
FA_NodeOfTrackArrayMovImm* tami = dynamic_cast<FA_NodeOfTrackArrayMovImm*>(node);
|
||||
write_byte(result, tami->operation);
|
||||
write_tai(result, tami->key);
|
||||
write_quadword(result, tami->imm_value);
|
||||
node = tami->nxt_node;
|
||||
} else if (node->type == track_array_mov_halfinvariant) {
|
||||
FA_NodeOfTrackArrayMovHalfinvariant* tamh = dynamic_cast<FA_NodeOfTrackArrayMovHalfinvariant *>(node);
|
||||
write_byte(result, tamh->operation);
|
||||
write_tai(result, tamh->key);
|
||||
node = tamh->nxt_node;
|
||||
} else if (node->type == det_char_crossroads) {
|
||||
FA_NodeOfDetCharCrossroads* dcc = dynamic_cast<FA_NodeOfDetCharCrossroads*>(node);
|
||||
nonthrowing_assert(first_read_ns + second_read_ns < UINT32_MAX);
|
||||
if (dcc->matching)
|
||||
cmd_MATCH(result);
|
||||
reading_head(dcc->second_ns);
|
||||
std::vector<codeset_t> codesets;
|
||||
std::vector<bookmark_id_t> branches;
|
||||
for (const DFA_CrossroadPath& p: dcc->crossroads) {
|
||||
codesets.push_back(p.input);
|
||||
branches.push_back(nodesBookmark(p.nxt_node));
|
||||
addBranching(p.nxt_node);
|
||||
}
|
||||
write_filter(result, bookmark_manager, codesets, branches);
|
||||
if (dcc->crossroads.empty())
|
||||
break;
|
||||
node = dcc->crossroads[0].nxt_node;
|
||||
} else
|
||||
assert(false);
|
||||
}
|
||||
}
|
||||
for (size_t j = 0; j < not_yet_dedicated_second_read_ns_ssids.size(); j++) {
|
||||
belated_sslot_id(result, not_yet_dedicated_second_read_ns_ssids[j], j + first_read_ns);
|
||||
}
|
||||
}
|
10
src/libregexis024fa/graph_to_bytecode/core.h
Normal file
10
src/libregexis024fa/graph_to_bytecode/core.h
Normal file
@ -0,0 +1,10 @@
|
||||
#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024FA_GRAPH_TO_BYTECODE_CORE_H
|
||||
#define LIBREGEXIS024_SRC_LIBREGEXIS024FA_GRAPH_TO_BYTECODE_CORE_H
|
||||
|
||||
#include <libregexis024fa/finite_automaton.h>
|
||||
#include <libregexis024fa/graph_to_bytecode/natural_compiler_utils.h>
|
||||
|
||||
/* Appends to `result` the bytecode of all nodes reachable from fa.start.
 * The three counters are reset to 0 and then receive the number of used slots of each kind.
 * `error` is set to -1 when a slot counter would reach UINT32_MAX; it is not written on success. */
void compilation_core(std::vector<uint8_t>& result,
                      FA_Container& fa,
                      explicit_bookmarks& bookmark_manager,
                      size_t& first_read_ns,
                      size_t& second_read_ns,
                      size_t& fork_ss_ns,
                      int& error);
|
||||
|
||||
#endif
|
102
src/libregexis024fa/graph_to_bytecode/fa_compiler.cpp
Normal file
102
src/libregexis024fa/graph_to_bytecode/fa_compiler.cpp
Normal file
@ -0,0 +1,102 @@
|
||||
#include <libregexis024fa/graph_to_bytecode/fa_compiler.h>
|
||||
|
||||
#include <libregexis024vm/vm_opcodes.h>
|
||||
#include <libregexis024fa/graph_to_bytecode/natural_compiler_utils.h>
|
||||
#include <assert.h>
|
||||
#include <libregexis024fa/graph_to_bytecode/writing_commands.h>
|
||||
|
||||
#include <libregexis024fa/graph_to_bytecode/core.h>
|
||||
|
||||
void write_priority_table_actions(std::vector<uint8_t>& result, RegexPriorityTable &priority_table) {
|
||||
for (RegexPriorityTableAction& act: priority_table) {
|
||||
if (act.pos.isForRange()) {
|
||||
write_byte(result, regex024_opcodes::DDIST_RABX_SELARR);
|
||||
write_tai(result, act.pos.first);
|
||||
write_tai(result, act.pos.second);
|
||||
} else {
|
||||
write_byte(result, regex024_opcodes::DMOV_RABX_SELARR);
|
||||
write_tai(result, act.pos.first);
|
||||
}
|
||||
write_byte(result, act.minimize ?
|
||||
regex024_opcodes::SIFTPRIOR_MIN_RABX :
|
||||
regex024_opcodes::SIFTPRIOR_MAX_RABX);
|
||||
}
|
||||
write_byte(result, regex024_opcodes::SIFT_DONE);
|
||||
}
|
||||
|
||||
struct belate_initialization_parameters {
|
||||
size_t todo_pos_read_ss_n;
|
||||
size_t todo_pos_fork_ss_n;
|
||||
size_t todo_pos_second_ns_size;
|
||||
|
||||
void complete_it(std::vector<uint8_t>& result,
|
||||
regex_sslot_id_t first_read_ns, regex_sslot_id_t second_read_ns, regex_sslot_id_t fork_ss_ns)
|
||||
{
|
||||
assert((uint64_t)first_read_ns + (uint64_t)second_read_ns <= UINT32_MAX);
|
||||
belated_sslot_id(result, todo_pos_read_ss_n , first_read_ns + second_read_ns);
|
||||
belated_sslot_id(result, todo_pos_fork_ss_n, fork_ss_ns);
|
||||
belated_sslot_id(result, todo_pos_second_ns_size, second_read_ns);
|
||||
}
|
||||
};
|
||||
|
||||
/* when I compile initializational part of program, I don't yet know what to put in
|
||||
* PARAM_READ_SS_NUMBER, PARAM_FORK_SS_NUMBER and MSG_FED_INPUT_EXTENDED (second namespace size).
|
||||
* These values are belate. */
|
||||
belate_initialization_parameters write_some_normal_initialization(std::vector<uint8_t>& result,
|
||||
size_t selarr_size, const REGEX_IS024_FA_FirstStageFixInfo& info1)
|
||||
{
|
||||
belate_initialization_parameters todo;
|
||||
|
||||
write_byte(result, regex024_opcodes::PARAM_READ_SS_NUMBER);
|
||||
todo.todo_pos_read_ss_n = result.size();
|
||||
write_sslot_id(result, 0); // Belate
|
||||
|
||||
write_byte(result, regex024_opcodes::PARAM_FORK_SS_NUMBER);
|
||||
todo.todo_pos_fork_ss_n = result.size();
|
||||
write_sslot_id(result, 0); // Belate
|
||||
|
||||
write_byte(result, regex024_opcodes::PARAM_SELARR_LEN);
|
||||
write_tai(result, selarr_size);
|
||||
|
||||
write_byte(result, regex024_opcodes::MSG_MULTISTART_ALLOWED);
|
||||
write_byte(result, 1);
|
||||
|
||||
write_byte(result, regex024_opcodes::MSG_FED_INPUT_EXTENDED);
|
||||
write_byte(result, info1.fed_chars_extend_one_left ? 1 : 0);
|
||||
write_byte(result, info1.fed_chars_extend_one_right ? 1 : 0);
|
||||
todo.todo_pos_second_ns_size = result.size();
|
||||
write_sslot_id(result, 0); // Belate
|
||||
|
||||
write_byte(result, regex024_opcodes::INIT);
|
||||
return todo;
|
||||
}
|
||||
|
||||
void compile_fa_to_regexis024_bytecode(std::vector<uint8_t>& result,
|
||||
FA_Container &fa, RegexPriorityTable &priority_table,
|
||||
size_t selarr_size, const REGEX_IS024_FA_FirstStageFixInfo& info1, int& error)
|
||||
{
|
||||
error = 0;
|
||||
explicit_bookmarks bookmark_manager;
|
||||
|
||||
if (!priority_table.empty()) {
|
||||
bookmark_id_t BM_sift_function = bookmark_manager.new_bookmark();
|
||||
bookmark_id_t BM_after_sift = bookmark_manager.new_bookmark();
|
||||
|
||||
cmd_JUMP(result, bookmark_manager, BM_after_sift);
|
||||
bookmark_manager.land_bookmark(result, BM_sift_function);
|
||||
write_priority_table_actions(result, priority_table);
|
||||
bookmark_manager.land_bookmark(result, BM_after_sift);
|
||||
|
||||
write_byte(result, regex024_opcodes::PARAM_COLSIFTFUNC_SET);
|
||||
bookmark_manager.write_unresolved_reference(result, BM_sift_function);
|
||||
}
|
||||
|
||||
belate_initialization_parameters init_param_todo = write_some_normal_initialization(result, selarr_size, info1);
|
||||
|
||||
size_t first_read_ns, second_read_ns, fork_ss_ns;
|
||||
compilation_core(result, fa, bookmark_manager, first_read_ns, second_read_ns, fork_ss_ns, error);
|
||||
if (error < 0)
|
||||
return;
|
||||
init_param_todo.complete_it(result, first_read_ns, second_read_ns, fork_ss_ns);
|
||||
bookmark_manager.finish(result);
|
||||
}
|
14
src/libregexis024fa/graph_to_bytecode/fa_compiler.h
Normal file
14
src/libregexis024fa/graph_to_bytecode/fa_compiler.h
Normal file
@ -0,0 +1,14 @@
|
||||
#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024FA_GRAPH_TO_BYTECODE_FA_COMPILER_H
|
||||
#define LIBREGEXIS024_SRC_LIBREGEXIS024FA_GRAPH_TO_BYTECODE_FA_COMPILER_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <vector>
|
||||
#include <libregexis024fa/finite_automaton.h>
|
||||
#include <libregexis024fa/selarr_priority_table.h>
|
||||
#include <libregexis024fa/fa_first_stage_fix.h>
|
||||
|
||||
void compile_fa_to_regexis024_bytecode(std::vector<uint8_t>& result, FA_Container& fa, RegexPriorityTable& priority_table,
|
||||
size_t selarr_size, const REGEX_IS024_FA_FirstStageFixInfo& info1, int& error);
|
||||
|
||||
#endif
|
||||
|
120
src/libregexis024fa/graph_to_bytecode/filter.cpp
Normal file
120
src/libregexis024fa/graph_to_bytecode/filter.cpp
Normal file
@ -0,0 +1,120 @@
|
||||
#include <libregexis024fa/graph_to_bytecode/filter.h>
|
||||
|
||||
#include <assert.h>
|
||||
#include <algorithm>
|
||||
#include <libregexis024fa/graph_to_bytecode/writing_commands.h>
|
||||
|
||||
std::vector<FilterSegment> convert_to_compSeg(const std::vector<codeset_t>& crossroad_codesets)
|
||||
{
|
||||
std::vector<FilterSegment> compSeg;
|
||||
std::vector<FilterSegment> seg;
|
||||
for (size_t i = 0; i < crossroad_codesets.size(); i++) {
|
||||
for (auto& p: crossroad_codesets[i]) {
|
||||
seg.emplace_back(i, p.first, p.second);
|
||||
}
|
||||
}
|
||||
std::sort(seg.begin(), seg.end(),
|
||||
[](const FilterSegment& a, const FilterSegment& b)->bool{return a.L < b.L;});
|
||||
if (seg.empty()) {
|
||||
compSeg.emplace_back(-1, 0, UINT32_MAX);
|
||||
} else {
|
||||
if (seg[0].L > 0)
|
||||
compSeg.emplace_back(-1, 0, seg[0].L - 1);
|
||||
size_t N = seg.size();
|
||||
for (size_t i = 0; i + 1 < N; i++) {
|
||||
compSeg.push_back(seg[i]);
|
||||
assert(seg[i].R < seg[i + 1].L);
|
||||
if (seg[i].R + 1 < seg[i + 1].L)
|
||||
compSeg.emplace_back(-1, seg[i].R + 1, seg[i + 1].L - 1);
|
||||
}
|
||||
compSeg.push_back(seg.back());
|
||||
if (seg.back().R < UINT32_MAX)
|
||||
compSeg.emplace_back(-1, seg[N - 1].R + 1, UINT32_MAX);
|
||||
}
|
||||
assert(!compSeg.empty());
|
||||
return compSeg;
|
||||
}
|
||||
|
||||
/* Return whether the resulting bytecode relies on me placing [0]'th node at the end */
|
||||
void write_filter_exit(std::vector<uint8_t>& result, explicit_bookmarks& bookmark_manager,
|
||||
const std::vector<bookmark_id_t>& crossroad_marks,
|
||||
ssize_t color, bool at_the_end, bool& relies_on_proper_ending)
|
||||
{
|
||||
if (color < 0) {
|
||||
cmd_DIE(result);
|
||||
} else if (color != 0 || !at_the_end) {
|
||||
cmd_JUMP(result, bookmark_manager, crossroad_marks[color]);
|
||||
} else {
|
||||
relies_on_proper_ending = true;
|
||||
}
|
||||
}
|
||||
|
||||
// todo: use return value of this function
|
||||
bool write_filter(std::vector<uint8_t>& result, explicit_bookmarks& bookmark_manager,
|
||||
const std::vector<codeset_t>& crossroad_codesets, const std::vector<bookmark_id_t>& crossroad_marks)
|
||||
{
|
||||
bool relies_on_proper_ending = false;
|
||||
|
||||
std::vector<FilterSegment> compSeg = convert_to_compSeg(crossroad_codesets);
|
||||
size_t N = compSeg.size();
|
||||
struct RecFrame {
|
||||
size_t Li;
|
||||
size_t Ri;
|
||||
bool second_part = false;
|
||||
bookmark_id_t to_the_right_part;
|
||||
|
||||
RecFrame(size_t li, size_t ri): Li(li),Ri(ri) {}
|
||||
};
|
||||
|
||||
std::vector<RecFrame> call_stack = {RecFrame(0, N - 1)};
|
||||
|
||||
auto is_sandwich = [&](size_t Li, size_t Ri) -> bool {
|
||||
return Li + 2 == Ri && compSeg[Li].color == compSeg[Ri].color && compSeg[Li + 1].L == compSeg[Li + 1].R;
|
||||
};
|
||||
|
||||
while (!call_stack.empty()) {
|
||||
RecFrame& cur_frame = call_stack.back();
|
||||
size_t Li = cur_frame.Li;
|
||||
size_t Ri = cur_frame.Ri;
|
||||
if (Li == Ri) {
|
||||
write_filter_exit(result, bookmark_manager, crossroad_marks, compSeg[Li].color,
|
||||
Ri + 1 == N, relies_on_proper_ending);
|
||||
call_stack.pop_back();
|
||||
} else if (is_sandwich(Li, Ri)){
|
||||
ssize_t A = compSeg[Li].color;
|
||||
ssize_t B = compSeg[Li + 1].color;
|
||||
size_t midVal = compSeg[Li + 1].L;
|
||||
if (B < 0) {
|
||||
assert(A >= 0);
|
||||
bookmark_id_t b_to_end = bookmark_manager.new_bookmark();
|
||||
cmd_JCEQUAL(result, bookmark_manager, midVal, b_to_end);
|
||||
cmd_JUMP(result, bookmark_manager, crossroad_marks[A]);
|
||||
bookmark_manager.land_bookmark(result, b_to_end);
|
||||
cmd_DIE(result);
|
||||
} else {
|
||||
cmd_JCEQUAL(result, bookmark_manager, midVal, crossroad_marks[B]);
|
||||
write_filter_exit(result, bookmark_manager, crossroad_marks, A,
|
||||
Ri + 1 == N, relies_on_proper_ending);
|
||||
}
|
||||
call_stack.pop_back();
|
||||
} else {
|
||||
size_t m = (Li + Ri) / 2;
|
||||
if (!cur_frame.second_part) {
|
||||
cur_frame.to_the_right_part = bookmark_manager.new_bookmark();
|
||||
cmd_JCGRTR(result, bookmark_manager, compSeg[m].R, cur_frame.to_the_right_part);
|
||||
cur_frame.second_part = true;
|
||||
/* cur_frame was just invalidated */
|
||||
call_stack.emplace_back(Li, m);
|
||||
} else {
|
||||
bookmark_manager.land_bookmark(result, cur_frame.to_the_right_part);
|
||||
/* cur_frame was invalidated */
|
||||
call_stack.pop_back();
|
||||
call_stack.emplace_back(m + 1, Ri);
|
||||
}
|
||||
}
|
||||
}
|
||||
return relies_on_proper_ending;
|
||||
}
|
||||
|
||||
/* Plain member-wise construction: color, left border, right border */
FilterSegment::FilterSegment(ssize_t color, uint32_t l, uint32_t r)
    : color(color), L(l), R(r)
{}
|
||||
//
|
21
src/libregexis024fa/graph_to_bytecode/filter.h
Normal file
21
src/libregexis024fa/graph_to_bytecode/filter.h
Normal file
@ -0,0 +1,21 @@
|
||||
#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024FA_FILTER_H
|
||||
#define LIBREGEXIS024_SRC_LIBREGEXIS024FA_FILTER_H
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <libregexis024fa/codeset.h>
|
||||
#include <libregexis024fa/graph_to_bytecode/natural_compiler_utils.h>
|
||||
|
||||
struct FilterSegment {
|
||||
ssize_t color;
|
||||
uint32_t L, R;
|
||||
|
||||
FilterSegment(ssize_t color, uint32_t l, uint32_t r);
|
||||
};
|
||||
|
||||
/* Return whether user of function must place [0]'th option after the filter
|
||||
* The filter can end up being written in such a way that the end will never be reached */
|
||||
bool write_filter(std::vector<uint8_t>& result, explicit_bookmarks& bookmark_manager,
|
||||
const std::vector<codeset_t>& crossroad_codesets, const std::vector<bookmark_id_t>& crossroad_marks);
|
||||
|
||||
#endif
|
115
src/libregexis024fa/graph_to_bytecode/natural_compiler_utils.cpp
Normal file
115
src/libregexis024fa/graph_to_bytecode/natural_compiler_utils.cpp
Normal file
@ -0,0 +1,115 @@
|
||||
#include <libregexis024fa/graph_to_bytecode/natural_compiler_utils.h>
|
||||
#include <assert.h>
|
||||
#include <libregexis024vm/vm_opcodes.h>
|
||||
|
||||
/* Appends the `width` least significant bytes of x to result, least significant byte first.
 * File-local replacement for the former statement macro. */
static void append_least_signif_first(std::vector<uint8_t>& result, uint64_t x, int width) {
    for (int i = 0; i < width; i++) {
        result.push_back(static_cast<uint8_t>(x & 0xffLU));
        x >>= 8;
    }
}

/* Appends one byte */
void write_byte(std::vector<uint8_t>& result, uint8_t x) {
    result.push_back(x);
}

/* Appends 2 bytes, least significant first */
void write_word(std::vector<uint8_t>& result, uint16_t x) {
    append_least_signif_first(result, x, 2);
}

/* Appends 4 bytes, least significant first */
void write_doubleword(std::vector<uint8_t>& result, uint32_t x) {
    append_least_signif_first(result, x, 4);
}

/* Appends 8 bytes, least significant first */
void write_quadword(std::vector<uint8_t>& result, uint64_t x) {
    append_least_signif_first(result, x, 8);
}
|
||||
|
||||
/* Overwrites `width` bytes of result starting at pos with the least significant bytes of
 * value, least significant byte first. Every overwritten byte is expected to still be a
 * zero placeholder (asserted). File-local replacement for the former statement macro. */
static void patch_least_signif_first(std::vector<uint8_t>& result, size_t pos, uint64_t value, size_t width) {
    /* Written this way so that the range check cannot wrap around for a huge pos */
    assert(pos <= result.size() && width <= result.size() - pos);
    for (size_t i = 0; i < width; i++) {
        assert(result[pos + i] == 0);
        result[pos + i] = static_cast<uint8_t>(value & 0xffLU);
        value >>= 8;
    }
}

/* Overwrites one byte at pos. Unlike the wider variants, the old content is not checked. */
void belated_byte(std::vector<uint8_t>& result, size_t pos, uint8_t value) {
    assert(pos < result.size());
    result[pos] = value;
}

/* Fills a 2-byte zero placeholder at pos */
void belated_word(std::vector<uint8_t>& result, size_t pos, uint16_t value) {
    patch_least_signif_first(result, pos, value, 2);
}

/* Fills a 4-byte zero placeholder at pos */
void belated_doubleword(std::vector<uint8_t>& result, size_t pos, uint32_t value) {
    patch_least_signif_first(result, pos, value, 4);
}

/* Fills an 8-byte zero placeholder at pos */
void belated_quadword(std::vector<uint8_t>& result, size_t pos, uint64_t value) {
    patch_least_signif_first(result, pos, value, 8);
}
|
||||
|
||||
void write_sslot_id(std::vector<uint8_t>& result, regex_sslot_id_t x) {
|
||||
write_doubleword(result, x);
|
||||
}
|
||||
|
||||
void write_tai(std::vector<uint8_t>& result, regex_tai_t x) {
|
||||
write_word(result, x);
|
||||
}
|
||||
|
||||
void write_near_ptr(std::vector<uint8_t>& result, regex_near_ptr_t x) {
|
||||
write_quadword(result, x);
|
||||
}
|
||||
|
||||
void belated_sslot_id(std::vector<uint8_t>& result, size_t pos, regex_sslot_id_t value) {
|
||||
belated_doubleword(result, pos, value);
|
||||
}
|
||||
|
||||
void belated_tai(std::vector<uint8_t>& result, size_t pos, regex_tai_t value) {
|
||||
belated_word(result, pos, value);
|
||||
}
|
||||
|
||||
void belated_near_ptr(std::vector<uint8_t>& result, size_t pos, regex_near_ptr_t value) {
|
||||
belated_quadword(result, pos, value);
|
||||
}
|
||||
|
||||
/* Creates one more bookmark (a new element of `pile`) and returns its id */
bookmark_id_t explicit_bookmarks::new_bookmark() {
    pile.emplace_back();
    const bookmark_id_t fresh = free_bid;
    free_bid = fresh + 1;
    return fresh;
}
|
||||
|
||||
/* Appends a zeroed near pointer to `result` and remembers its offset in the bookmark's
 * list of references, so that finish() can fill in the real address later.
 * `bm` must be an id obtained from this manager (asserted). */
void explicit_bookmarks::write_unresolved_reference(std::vector<uint8_t> &result, bookmark_id_t bm) {
    assert(bm < pile.size());
    size_t where_to_fill_later = result.size();
    write_near_ptr(result, 0);
    pile[bm].positions_of_belated_refs.push_back(where_to_fill_later);
}
|
||||
|
||||
void explicit_bookmarks::land_bookmark(std::vector<uint8_t> &result, bookmark_id_t bm) {
|
||||
assert(!pile[bm].placed_somewhere);
|
||||
pile[bm].placed_somewhere = true;
|
||||
pile[bm].actual_position = result.size();
|
||||
}
|
||||
|
||||
void explicit_bookmarks::finish(std::vector<uint8_t> &result) {
|
||||
for (explicit_bookmark_info& bmi: pile) {
|
||||
assert(bmi.positions_of_belated_refs.empty() || bmi.placed_somewhere);
|
||||
if (bmi.placed_somewhere) {
|
||||
for (size_t ref_to_mine_belate: bmi.positions_of_belated_refs) {
|
||||
belated_near_ptr(result, ref_to_mine_belate, bmi.actual_position);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Creates n bookmarks with consecutive ids and returns the first id of the range
 * (the ids are first, first + 1, ..., first + n - 1) */
bookmark_id_t explicit_bookmarks::new_range_of_bookmarks(size_t n) {
    const bookmark_id_t first = free_bid;
    pile.resize(pile.size() + n);
    free_bid = first + n;
    return first;
}
|
||||
|
||||
/* Tells whether land_bookmark was already called for `bm`.
 * `bm` must be an id obtained from this manager (asserted). */
bool explicit_bookmarks::has_landed(bookmark_id_t bm) {
    assert(bm < pile.size());
    return pile[bm].placed_somewhere;
}
|
||||
|
||||
#undef put_belated_to_res
|
@ -0,0 +1,63 @@
|
||||
#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024FA_NATURAL_COMPILER_UTILS_H
|
||||
#define LIBREGEXIS024_SRC_LIBREGEXIS024FA_NATURAL_COMPILER_UTILS_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <libregexis024vm/vm_opcodes_types.h>
|
||||
#include <vector>
|
||||
|
||||
void write_byte(std::vector<uint8_t>& result, uint8_t x);
|
||||
void write_word(std::vector<uint8_t>& result, uint16_t x);
|
||||
void write_doubleword(std::vector<uint8_t>& result, uint32_t x);
|
||||
void write_quadword(std::vector<uint8_t>& result, uint64_t x);
|
||||
|
||||
void belated_byte(std::vector<uint8_t>& result, size_t pos, uint8_t value);
|
||||
void belated_word(std::vector<uint8_t>& result, size_t pos, uint16_t value);
|
||||
void belated_doubleword(std::vector<uint8_t>& result, size_t pos, uint32_t value);
|
||||
void belated_quadword(std::vector<uint8_t>& result, size_t pos, uint64_t value);
|
||||
|
||||
|
||||
void write_sslot_id(std::vector<uint8_t>& result, regex_sslot_id_t x);
|
||||
void write_tai(std::vector<uint8_t>& result, regex_tai_t x);
|
||||
void write_near_ptr(std::vector<uint8_t>& result, regex_near_ptr_t x);
|
||||
|
||||
void belated_sslot_id(std::vector<uint8_t>& result, size_t pos, regex_sslot_id_t value);
|
||||
void belated_tai(std::vector<uint8_t>& result, size_t pos, regex_tai_t value);
|
||||
void belated_near_ptr(std::vector<uint8_t>& result, size_t pos, regex_near_ptr_t value);
|
||||
|
||||
// constexpr uint64_t INSTRUCTION_SZ = REGEX024_BYTECODE_INSTRUCTION_SZ;
|
||||
// constexpr uint64_t SSLOT_ID_SZ = REGEX024_BYTECODE_SSLOT_ID_SZ;
|
||||
// constexpr uint64_t TRACK_ARRAY_INDEX_ID_SZ = REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ;
|
||||
// constexpr uint64_t NEAR_POINTER_SZ = REGEX024_BYTECODE_NEAR_POINTER_SZ;
|
||||
|
||||
typedef size_t bookmark_id_t;
|
||||
|
||||
/* Everything known about one bookmark */
struct explicit_bookmark_info {
    /* Offsets in the bytecode where a reference to this bookmark awaits to be filled */
    std::vector<size_t> positions_of_belated_refs;
    /* Whether the bookmark was bound to a position already */
    bool placed_somewhere = false;
    /* Meaningful only when placed_somewhere is true. Initialized anyway, so that copying
     * an element (e.g. when the containing vector grows) never reads an indeterminate value */
    size_t actual_position = 0;
};
|
||||
|
||||
struct explicit_bookmarks {
|
||||
bookmark_id_t free_bid = 0;
|
||||
/* For each named explicit bookmark there is an element in PILE */
|
||||
std::vector<explicit_bookmark_info> pile;
|
||||
|
||||
bookmark_id_t new_bookmark();
|
||||
|
||||
/* bm is the bookmark I refer to. Each bookmark has an id. It is like a name, but fits in 8 bytes */
|
||||
void write_unresolved_reference(std::vector<uint8_t>& result, bookmark_id_t bm);
|
||||
|
||||
/* bm is the bookmark I place into program `result` */
|
||||
void land_bookmark(std::vector<uint8_t>& result, bookmark_id_t bm);
|
||||
|
||||
/* call it at the very end of bytecode-building */
|
||||
void finish(std::vector<uint8_t>& result);
|
||||
|
||||
/* Returns offset of range of bookmark id's */
|
||||
bookmark_id_t new_range_of_bookmarks(size_t n);
|
||||
|
||||
bool has_landed(bookmark_id_t bm);
|
||||
};
|
||||
|
||||
|
||||
#endif
|
75
src/libregexis024fa/graph_to_bytecode/writing_commands.cpp
Normal file
75
src/libregexis024fa/graph_to_bytecode/writing_commands.cpp
Normal file
@ -0,0 +1,75 @@
|
||||
#include <libregexis024fa/graph_to_bytecode/writing_commands.h>
|
||||
#include <libregexis024vm/vm_opcodes.h>
|
||||
#include <assert.h>
|
||||
|
||||
/* Writes JUMP: the opcode byte, then a not yet resolved reference to bookmark `dest` */
void cmd_JUMP(std::vector<uint8_t>& result, explicit_bookmarks& bookmark_manager, bookmark_id_t dest)
{
    write_byte(result, regex024_opcodes::JUMP);
    bookmark_manager.write_unresolved_reference(result, dest);
}
|
||||
|
||||
constexpr regex024_opcode cmp_EQUAL[4] = {regex024_opcodes::JCEQUAL_B, regex024_opcodes::JCEQUAL_W,
|
||||
regex024_opcodes::JCEQUAL_DW, regex024_opcodes::JCEQUAL_QW};
|
||||
constexpr regex024_opcode cmp_LESS[4] = {regex024_opcodes::JCLESS_B, regex024_opcodes::JCLESS_W,
|
||||
regex024_opcodes::JCLESS_DW, regex024_opcodes::JCLESS_QW};
|
||||
constexpr regex024_opcode cmp_GRTR[4] = {regex024_opcodes::JCGRTR_B, regex024_opcodes::JCGRTR_W,
|
||||
regex024_opcodes::JCGRTR_DW, regex024_opcodes::JCGRTR_QW};
|
||||
|
||||
|
||||
void cmd_JC(const regex024_opcode cmpT[4],
|
||||
std::vector<uint8_t>& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest)
|
||||
{
|
||||
if (val <= UINT8_MAX) {
|
||||
write_byte(result, cmpT[0]);
|
||||
write_byte(result, static_cast<uint8_t>(val));
|
||||
} else if (val <= UINT16_MAX) {
|
||||
write_byte(result, cmpT[1]);
|
||||
write_word(result, static_cast<uint16_t>(val));
|
||||
} else if (val <= UINT32_MAX) {
|
||||
write_byte(result, cmpT[2]);
|
||||
write_doubleword(result, static_cast<uint32_t>(val));
|
||||
} else {
|
||||
write_byte(result, cmpT[3]);
|
||||
write_quadword(result, val);
|
||||
}
|
||||
bookmark_manager.write_unresolved_reference(result, dest);
|
||||
}
|
||||
|
||||
|
||||
void cmd_JCEQUAL(std::vector<uint8_t>& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest) {
|
||||
cmd_JC(cmp_EQUAL, result, bookmark_manager, val, dest);
|
||||
}
|
||||
|
||||
void cmd_JCLESS(std::vector<uint8_t>& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest) {
|
||||
cmd_JC(cmp_LESS, result, bookmark_manager, val, dest);
|
||||
}
|
||||
|
||||
void cmd_JCGRTR(std::vector<uint8_t>& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest) {
|
||||
cmd_JC(cmp_GRTR, result, bookmark_manager, val, dest);
|
||||
}
|
||||
|
||||
/* Writes the single-byte DIE command */
void cmd_DIE(std::vector<uint8_t> &result)
{
    write_byte(result, regex024_opcodes::DIE);
}

/* Writes the single-byte MATCH command */
void cmd_MATCH(std::vector<uint8_t> &result)
{
    write_byte(result, regex024_opcodes::MATCH);
}
|
||||
|
||||
void cmd_READ_first_ns(std::vector<uint8_t>& result, size_t slot) {
|
||||
assert(slot <= UINT32_MAX);
|
||||
write_byte(result, regex024_opcodes::READ);
|
||||
write_sslot_id(result, slot);
|
||||
}
|
||||
|
||||
void cmd_FORK(std::vector<uint8_t> &result, explicit_bookmarks& bookmark_manager, size_t slot, bookmark_id_t dest) {
|
||||
assert(slot <= UINT32_MAX);
|
||||
write_byte(result, regex024_opcodes::FORK);
|
||||
write_sslot_id(result, slot);
|
||||
bookmark_manager.write_unresolved_reference(result, dest);
|
||||
}
|
||||
|
||||
/* Writes READ with a zero placeholder instead of the slot number. The offset of the
 * placeholder is appended to belate_second_read_ns_slot_args, so that the caller can
 * fill in the real slot number later. */
void cmd_READ_second_ns(std::vector<uint8_t>& result, std::vector<size_t>& belate_second_read_ns_slot_args)
{
    write_byte(result, regex024_opcodes::READ);
    const size_t operand_pos = result.size();
    belate_second_read_ns_slot_args.push_back(operand_pos);
    write_sslot_id(result, 0);
}
|
20
src/libregexis024fa/graph_to_bytecode/writing_commands.h
Normal file
20
src/libregexis024fa/graph_to_bytecode/writing_commands.h
Normal file
@ -0,0 +1,20 @@
|
||||
#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024FA_WRITING_COMMANDS_H
|
||||
#define LIBREGEXIS024_SRC_LIBREGEXIS024FA_WRITING_COMMANDS_H
|
||||
|
||||
#include <libregexis024fa/graph_to_bytecode/natural_compiler_utils.h>
|
||||
#include <libregexis024vm/vm_opcodes.h>
|
||||
|
||||
void cmd_JUMP(std::vector<uint8_t>& result, explicit_bookmarks& bookmark_manager, bookmark_id_t dest);
|
||||
|
||||
void cmd_JCEQUAL(std::vector<uint8_t>& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest);
|
||||
void cmd_JCLESS(std::vector<uint8_t>& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest);
|
||||
void cmd_JCGRTR(std::vector<uint8_t>& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest);
|
||||
|
||||
void cmd_DIE(std::vector<uint8_t>& result);
|
||||
void cmd_MATCH(std::vector<uint8_t>& result);
|
||||
|
||||
void cmd_READ_first_ns(std::vector<uint8_t>& result, size_t slot);
|
||||
void cmd_READ_second_ns(std::vector<uint8_t>& result, std::vector<size_t>& belate_second_read_ns_slot_args);
|
||||
void cmd_FORK(std::vector<uint8_t> &result, explicit_bookmarks& bookmark_manager, size_t slot, bookmark_id_t dest);
|
||||
|
||||
#endif
|
71
src/libregexis024fa/misc_fa_funcs.cpp
Normal file
71
src/libregexis024fa/misc_fa_funcs.cpp
Normal file
@ -0,0 +1,71 @@
|
||||
#include <libregexis024fa/misc_fa_funcs.h>
#include <libregexis024vm/vm_opcodes.h>
#include <assert.h>
#include <stdlib.h>
#include <libregexis024vm/utils.h>
|
||||
|
||||
/* Redirects the edge stored in *old_node_ptr to new_node, keeping the reference counters
 * in sync: the old target (if any) loses one reference, the new target (if any) gains one.
 * Either side may be null; old_node_ptr itself may not. */
void reattach_fa_node_edge(FA_Node **old_node_ptr, FA_Node *new_node) {
    assert(old_node_ptr);
    FA_Node* previous = *old_node_ptr;
    if (previous){
        assert(previous->refs);
        previous->refs--;
    }
    if (new_node)
        new_node->refs++;
    *old_node_ptr = new_node;
}
|
||||
|
||||
/* We basically reattch fa.start to node */
|
||||
void yay_new_start(FA_Container &fa, FA_NodePathPart *node) {
|
||||
assert(node);
|
||||
node->refs++;
|
||||
node->nxt_node = fa.start;
|
||||
fa.start = node;
|
||||
}
|
||||
|
||||
/* Appends one more outgoing option to a forking node. A null destination is stored as is;
 * a non-null one gains a reference. */
void add_option_to_fork_node(FA_NodeOfForking *fnode, FA_Node *transition_dest) {
    fnode->nxt_options.push_back(transition_dest);
    if (transition_dest == nullptr)
        return;
    transition_dest->refs++;
}
|
||||
|
||||
/* Redirects node->nxt_node to dest with proper reference counting
 * (shorthand for reattach_fa_node_edge on that field) */
void reattach_nxt_node(FA_NodePathPart *node, FA_Node *dest) {
    FA_Node** edge = &(node->nxt_node);
    reattach_fa_node_edge(edge, dest);
}
|
||||
|
||||
// todo: get rid of exitf in the whole project
|
||||
FA_Node* copy_node_no_container_adjustments(FA_Node& node){
|
||||
FA_Node* res;
|
||||
/* Using implicitly defined copy constructors */
|
||||
#define typeCase(etype, ctype) case etype: res = new ctype((ctype&)node); break;
|
||||
switch (node.type) {
|
||||
typeCase(match, FA_NodeOfMatch)
|
||||
typeCase(one_char_read, FA_NodeOfOneCharRead)
|
||||
typeCase(forking, FA_NodeOfForking)
|
||||
typeCase(look_one_behind, FA_NodeOfLookOneBehind)
|
||||
typeCase(look_one_ahead, FA_NodeOfLookOneAhead)
|
||||
typeCase(track_array_mov_imm, FA_NodeOfTrackArrayMovImm)
|
||||
typeCase(track_array_mov_halfinvariant, FA_NodeOfTrackArrayMovHalfinvariant)
|
||||
typeCase(det_char_crossroads, FA_NodeOfDetCharCrossroads)
|
||||
default:
|
||||
assert(false);
|
||||
}
|
||||
#undef typeCase
|
||||
res->refs = 0;
|
||||
res->search_mark = -1;
|
||||
return res;
|
||||
}
|
||||
|
||||
/* In case when transferring the ownership of this new raw pointer has failed, node is destroyed, exception is thrown */
|
||||
FA_Node *copy_fa_node(FA_Node& node, FA_Container &fa) {
|
||||
FA_Node* res = copy_node_no_container_adjustments(node);
|
||||
/* Can invalidate ponter res (in which case it also throws exeption, so none of this matters in the end) */
|
||||
fa.registerNew(res);
|
||||
res->reAdd_references();
|
||||
return res;
|
||||
}
|
||||
|
||||
/* Copies `node` and registers the copy in resultFa. Unlike copy_fa_node, the reference
 * counters of the nodes the copy points to are left untouched. */
FA_Node *copy_fa_node_to_another_fa(FA_Node& node, FA_Container &resultFa) {
    FA_Node* duplicate = copy_node_no_container_adjustments(node);
    resultFa.registerNew(duplicate);
    return duplicate;
}
|
17
src/libregexis024fa/misc_fa_funcs.h
Normal file
17
src/libregexis024fa/misc_fa_funcs.h
Normal file
@ -0,0 +1,17 @@
|
||||
#ifndef LIBREGEXIS024_MISC_FA_FUNCS_H
|
||||
#define LIBREGEXIS024_MISC_FA_FUNCS_H
|
||||
|
||||
#include "finite_automaton.h"
|
||||
#include "fa_first_stage_fix.h"
|
||||
|
||||
FA_Node* copy_fa_node(FA_Node& node, FA_Container& fa);
|
||||
void yay_new_start(FA_Container& fa, FA_NodePathPart* node);
|
||||
void reattach_fa_node_edge(FA_Node** old_node_ptr, FA_Node* new_node);
|
||||
void add_option_to_fork_node(FA_NodeOfForking* fnode, FA_Node* transition_dest);
|
||||
void reattach_nxt_node(FA_NodePathPart* node, FA_Node* dest);
|
||||
|
||||
/* This is one weird operation: the new node in resultFa will still point to nodes in sourceFa,
 * without increasing the refcount of those nodes. YOU HAVE TO FIX IT ASAP */
|
||||
FA_Node* copy_fa_node_to_another_fa(FA_Node& node, FA_Container& resultFa);
|
||||
|
||||
#endif //LIBREGEXIS024_MISC_FA_FUNCS_H
|
15
src/libregexis024fa/selarr_priority_table.cpp
Normal file
15
src/libregexis024fa/selarr_priority_table.cpp
Normal file
@ -0,0 +1,15 @@
|
||||
#include <libregexis024fa/selarr_priority_table.h>
|
||||
#include <assert.h>
|
||||
|
||||
|
||||
bool RegexPriorityTableAction_Pos::isForRange() const {
|
||||
return second >= 0;
|
||||
}
|
||||
|
||||
RegexPriorityTableAction_Pos::RegexPriorityTableAction_Pos(int first, int second, tracking_var_type type):
|
||||
first(first),second(second), type(type) {}
|
||||
//
|
||||
|
||||
RegexPriorityTableAction::RegexPriorityTableAction(bool minimize, int first, int second, tracking_var_type type):
|
||||
minimize(minimize), pos(first, second, type) {}
|
||||
//
|
26
src/libregexis024fa/selarr_priority_table.h
Normal file
26
src/libregexis024fa/selarr_priority_table.h
Normal file
@ -0,0 +1,26 @@
|
||||
#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024FA_SELARR_PRIORITY_TABLE_H
|
||||
#define LIBREGEXIS024_SRC_LIBREGEXIS024FA_SELARR_PRIORITY_TABLE_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <vector>
|
||||
#include <libregexis024fa/tracking_variables.h>
|
||||
|
||||
struct RegexPriorityTableAction_Pos{
|
||||
/* first and second are indexes in selarr (but second can be -1 if it is unused) */
|
||||
int first;
|
||||
int second;
|
||||
tracking_var_type type;
|
||||
bool isForRange() const;
|
||||
|
||||
RegexPriorityTableAction_Pos(int first, int second, tracking_var_type type);
|
||||
};
|
||||
|
||||
struct RegexPriorityTableAction{
|
||||
bool minimize;
|
||||
RegexPriorityTableAction_Pos pos;
|
||||
RegexPriorityTableAction(bool minimize, int first, int second, tracking_var_type type);
|
||||
};
|
||||
|
||||
typedef std::vector<RegexPriorityTableAction> RegexPriorityTable;
|
||||
|
||||
#endif //LIBREGEXIS024_SRC_LIBREGEXIS024FA_SELARR_PRIORITY_TABLE_H
|
53
src/libregexis024fa/tracking_fa_nodes.cpp
Normal file
53
src/libregexis024fa/tracking_fa_nodes.cpp
Normal file
@ -0,0 +1,53 @@
|
||||
#include <libregexis024fa/tracking_fa_nodes.h>
|
||||
#include <assert.h>
|
||||
|
||||
/* Classification of the four tracking-array MOV opcodes along two axes:
 * what is stored (immediate / current position) and where (colarr / selarr).
 * Every predicate returns false for any other opcode. */

/* MOV_COLARR_IMM or MOV_SELARR_IMM */
bool isImmMovOpcode(regex024_opcode inst) {
    switch (inst) {
        case regex024_opcodes::MOV_COLARR_IMM:
        case regex024_opcodes::MOV_SELARR_IMM:
            return true;
        default:
            return false;
    }
}

/* MOV_COLARR_BTPOS or MOV_SELARR_CHPOS */
bool isCurPosMovOpcode(regex024_opcode inst) {
    switch (inst) {
        case regex024_opcodes::MOV_COLARR_BTPOS:
        case regex024_opcodes::MOV_SELARR_CHPOS:
            return true;
        default:
            return false;
    }
}

/* MOV_COLARR_IMM or MOV_COLARR_BTPOS */
bool isColarrOpcode(regex024_opcode inst) {
    switch (inst) {
        case regex024_opcodes::MOV_COLARR_IMM:
        case regex024_opcodes::MOV_COLARR_BTPOS:
            return true;
        default:
            return false;
    }
}

/* MOV_SELARR_IMM or MOV_SELARR_CHPOS */
bool isSelarrOpcode(regex024_opcode inst) {
    switch (inst) {
        case regex024_opcodes::MOV_SELARR_IMM:
        case regex024_opcodes::MOV_SELARR_CHPOS:
            return true;
        default:
            return false;
    }
}
|
||||
|
||||
bool isTrackingFaNode(const FA_Node *n) {
|
||||
return n->type == track_array_mov_imm || n->type == track_array_mov_halfinvariant;
|
||||
}
|
||||
|
||||
/* Operation that carries an immediate value */
TrackingOperationInFa::TrackingOperationInFa(regex024_opcode opcode, regex_tai_t key, uint64_t imm_value)
    : opcode(opcode), key(key), immValue(imm_value) {}

/* Operation without an immediate value. immValue is not needed by such operations, but it
 * is zeroed nevertheless: objects of this type get copied, and copying must never read an
 * indeterminate value. */
TrackingOperationInFa::TrackingOperationInFa(regex024_opcode opcode, regex_tai_t key)
    : opcode(opcode), key(key), immValue(0) {}
|
||||
|
||||
std::string TrackingOperationInFa::toString() const {
|
||||
switch (opcode){
|
||||
case regex024_opcodes::MOV_COLARR_IMM:
|
||||
return "colarr[" + std::to_string(key) + "] := " + std::to_string(immValue);
|
||||
case regex024_opcodes::MOV_SELARR_IMM:
|
||||
return "selarr[" + std::to_string(key) + "] := " + std::to_string(immValue);
|
||||
case regex024_opcodes::MOV_COLARR_BTPOS:
|
||||
return "colarr[" + std::to_string(key) + "] := cur byte position";
|
||||
case regex024_opcodes::MOV_SELARR_CHPOS:
|
||||
return "selarr[" + std::to_string(key) + "] := cur char position";
|
||||
default:
|
||||
return "wrong collection operation";
|
||||
}
|
||||
}
|
||||
|
||||
/* Creates, inside `fa`, the node that performs the tracking operation `op`.
 * An opcode that is neither an immediate MOV nor a current-position MOV is a programming
 * error (asserted). */
FA_NodePathPart* convert_to_node(const TrackingOperationInFa& op, FA_Container& fa) {
    if (!isImmMovOpcode(op.opcode)) {
        assert(isCurPosMovOpcode(op.opcode));
        return fa.makeTrackArrayMovHalfinvariant(op.opcode, op.key);
    }
    return fa.makeTrackArrayMovImm(op.opcode, op.key, op.immValue);
}
|
||||
|
31
src/libregexis024fa/tracking_fa_nodes.h
Normal file
31
src/libregexis024fa/tracking_fa_nodes.h
Normal file
@ -0,0 +1,31 @@
|
||||
#ifndef LIBREGEXIS024_TRACKING_FA_NODES_H
|
||||
#define LIBREGEXIS024_TRACKING_FA_NODES_H
|
||||
|
||||
#include <libregexis024vm/vm_opcodes.h>
|
||||
#include <libregexis024fa/finite_automaton.h>
|
||||
#include <string>
|
||||
|
||||
bool isImmMovOpcode(regex024_opcode inst);
|
||||
bool isCurPosMovOpcode(regex024_opcode inst);
|
||||
bool isColarrOpcode(regex024_opcode inst);
|
||||
bool isSelarrOpcode(regex024_opcode inst);
|
||||
|
||||
bool isTrackingFaNode(const FA_Node* n);
|
||||
|
||||
struct TrackingOperationInFa {
|
||||
regex024_opcode opcode;
|
||||
regex_tai_t key;
|
||||
/* Not needed for halfinvariant operations */
|
||||
uint64_t immValue;
|
||||
|
||||
TrackingOperationInFa(regex024_opcode opcode, regex_tai_t key, uint64_t imm_value);
|
||||
|
||||
TrackingOperationInFa(regex024_opcode opcode, regex_tai_t key);
|
||||
|
||||
std::string toString() const;
|
||||
};
|
||||
|
||||
FA_NodePathPart* convert_to_node(const TrackingOperationInFa& op, FA_Container& fa);
|
||||
|
||||
|
||||
#endif
|
14
src/libregexis024fa/tracking_variables.h
Normal file
14
src/libregexis024fa/tracking_variables.h
Normal file
@ -0,0 +1,14 @@
|
||||
#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024FA_TRACKING_VARIABLES_H
|
||||
#define LIBREGEXIS024_SRC_LIBREGEXIS024FA_TRACKING_VARIABLES_H
|
||||
|
||||
namespace tracking_var_types {
|
||||
enum tracking_var_type_I {
|
||||
range,
|
||||
dot_cur_pos,
|
||||
dot_immediate
|
||||
};
|
||||
}
|
||||
|
||||
typedef tracking_var_types::tracking_var_type_I tracking_var_type;
|
||||
|
||||
#endif
|
62
src/libregexis024sol/backslash_expression.cpp
Normal file
62
src/libregexis024sol/backslash_expression.cpp
Normal file
@ -0,0 +1,62 @@
|
||||
#include <libregexis024sol/special_terminals.h>
|
||||
#include <libregexis024sol/sol_misc_base.h>
|
||||
#include <assert.h>
|
||||
|
||||
/* Reads exactly `sz` hexadecimal digits from the input and returns their value.
 * If a character that is not a hex digit (0-9, a-f, A-F) is met, an error is
 * reported through `ctx` and 0 is returned; the digits consumed before the
 * offending character stay consumed. */
uint32_t read_hex(REGEX_IS024_MeaningContext& ctx, int sz){
    uint32_t res = 0;
    for (int i = 0; i < sz; i++){
        int32_t ch = peep(ctx);
        uint32_t digit;
        if ('0' <= ch && ch <= '9') {
            digit = (uint32_t)ch - '0';
        } else if ('a' <= ch && ch <= 'f') {
            /* Only a..f are hex digits; accepting up to 'z' produced values > 15 */
            digit = (uint32_t)ch - 'a' + 10;
        } else if ('A' <= ch && ch <= 'F') {
            digit = (uint32_t)ch - 'A' + 10;
        } else {
            report(ctx, "escape backslash expression: bad unicode code");
            return 0;
        }
        res = (res << 4) | digit;
        readChar(ctx);
    }
    return res;
}
|
||||
|
||||
/* Handles the \uXXXX / \UXXXXXXXX escapes: skips the leader character that the
 * caller has only peeked at, reads `sz` hex digits and returns a codeset that
 * holds the single resulting code. Errors from read_hex are left in `ctx`. */
void unicode_in_bs_case(REGEX_IS024_MeaningContext &ctx, bool &ret_is_multicode, codeset_t &ret_set, int sz){
    readChar(ctx);
    const uint32_t code = read_hex(ctx, sz); // Might create an error
    ret_is_multicode = false;
    ret_set = codeset_of_one_char(code);
}
|
||||
|
||||
void
|
||||
backslash_expression_parsing_try_regular(REGEX_IS024_MeaningContext &ctx, const CommonCodesets& cc,
|
||||
bool &ret_is_multicode, codeset_t &ret_set)
|
||||
{
|
||||
int32_t leader = peep(ctx);
|
||||
if (ctx.error)
|
||||
return;
|
||||
#define block(l, b, E) case l: ret_is_multicode = b; ret_set = E; readChar(ctx); break;
|
||||
switch (leader) {
|
||||
block('s', false, codeset_of_one_char(U' '))
|
||||
block('t', false, codeset_of_one_char(U'\t'))
|
||||
block('n', false, codeset_of_one_char(U'\n'))
|
||||
block('r', false, codeset_of_one_char(U'\r'))
|
||||
block('e', true, cc.spaces);
|
||||
block('E', true, invert_set(cc.spaces))
|
||||
block('w', true, cc.word_constituents);
|
||||
block('W', true, invert_set(cc.word_constituents));
|
||||
case 'u':
|
||||
unicode_in_bs_case(ctx, ret_is_multicode, ret_set, 4);
|
||||
break;
|
||||
case 'U':
|
||||
unicode_in_bs_case(ctx, ret_is_multicode, ret_set, 8);
|
||||
break;
|
||||
default:
|
||||
if (leader >= 0){
|
||||
ret_is_multicode = false;
|
||||
ret_set = codeset_of_one_char(leader);
|
||||
} else {
|
||||
report(ctx, "backslash in the wrong place");
|
||||
}
|
||||
}
|
||||
}
|
143
src/libregexis024sol/command_expression.cpp
Normal file
143
src/libregexis024sol/command_expression.cpp
Normal file
@ -0,0 +1,143 @@
|
||||
#include <libregexis024sol/special_terminals.h>
|
||||
|
||||
#include <libregexis024vm/utils.h>
|
||||
#include <libregexis024sol/sol_misc_base.h>
|
||||
#include <assert.h>
|
||||
#include <memory>
|
||||
|
||||
struct ParseCall{
|
||||
virtual ~ParseCall() = default;
|
||||
virtual std::unique_ptr<ParseCall> afterReceive(REGEX_IS024_MeaningContext& ctx) { assert(false); }
|
||||
virtual std::unique_ptr<ParseCall> firstTime(REGEX_IS024_MeaningContext& ctx) { assert(false); }
|
||||
};
|
||||
|
||||
struct Top_ParseCall: public ParseCall{
|
||||
Command& res;
|
||||
explicit Top_ParseCall(Command &res) : res(res) {}
|
||||
std::unique_ptr<ParseCall> firstTime(REGEX_IS024_MeaningContext &ctx) override;
|
||||
std::unique_ptr<ParseCall> afterReceive(REGEX_IS024_MeaningContext &ctx) override;
|
||||
};
|
||||
|
||||
struct Bracker_ParseCall: public ParseCall{
|
||||
std::vector<CommandArgument>& res;
|
||||
bool closingBraceEnded = false;
|
||||
explicit Bracker_ParseCall(std::vector<CommandArgument> &res) : res(res) {}
|
||||
std::unique_ptr<ParseCall> argReadProc(REGEX_IS024_MeaningContext& ctx);
|
||||
std::unique_ptr<ParseCall> firstTime(REGEX_IS024_MeaningContext &ctx) override;
|
||||
std::unique_ptr<ParseCall> afterReceive(REGEX_IS024_MeaningContext &ctx) override;
|
||||
};
|
||||
|
||||
#define call_ERROR_CHECK do { if (ctx.error) { return NULL; } } while (0)
|
||||
#define call_THROW(str) do { report(ctx, "command expression: " str); return NULL; } while (0)
|
||||
|
||||
/* Parses the head of a command. Expects the input to be positioned at `!`.
 *  - `!~`      : marks the command as tilda and finishes (the `~` is not consumed);
 *  - `!name;`  : command without arguments;
 *  - `!name{`  : pushes a Bracker_ParseCall that will read the arguments.
 * Anything else is reported as an error. */
std::unique_ptr<ParseCall> Top_ParseCall::firstTime(REGEX_IS024_MeaningContext &ctx) {
    /* The read must not live inside assert(): with NDEBUG it would vanish
     * and the '!' would never be consumed */
    const auto bang = readChar(ctx);
    assert(bang == U'!');
    (void)bang;
    int32_t ch = peep(ctx); call_ERROR_CHECK;
    if (ch == U'~'){
        /* Assumes `res` refers to a freshly initialized Command */
        res.tilda = true;
        return NULL;
    }
    res.name = tryRead_REGEX024_name(ctx); call_ERROR_CHECK;
    if (res.name.empty())
        call_THROW("top lvl: no command name specified");
    ch = peep(ctx); call_ERROR_CHECK;
    if (ch == U';'){
        readChar(ctx);
        return NULL;
    }
    if (ch == U'{'){
        return std::make_unique<Bracker_ParseCall>(res.arguments);
    }
    call_THROW("top lvl: command call should be ended with ';' or '{...}'");
}
|
||||
|
||||
/* Called once the `{...}` argument list has been parsed: nothing is left to do,
 * so the frame is popped by returning null. */
std::unique_ptr<ParseCall> Top_ParseCall::afterReceive(REGEX_IS024_MeaningContext &ctx) {
    return nullptr;
}
|
||||
|
||||
/* Consumes the opening `{` and starts reading arguments. */
std::unique_ptr<ParseCall> Bracker_ParseCall::firstTime(REGEX_IS024_MeaningContext &ctx) {
    /* The read must not live inside assert(): with NDEBUG it would vanish
     * and the '{' would never be consumed */
    const auto opening = readChar(ctx);
    assert(opening == U'{');
    (void)opening;
    return argReadProc(ctx);
}
|
||||
|
||||
/* A nested `{...}` has just been parsed: remember that the latest argument was
 * closed by a brace and go on reading the remaining arguments. */
std::unique_ptr<ParseCall> Bracker_ParseCall::afterReceive(REGEX_IS024_MeaningContext &ctx) {
    this->closingBraceEnded = true;
    return this->argReadProc(ctx);
}
|
||||
|
||||
/* Reads arguments of a `{...}` list until it either ends or a nested list starts.
 *  - `;`       appends an empty argument;
 *  - `}`       closes the list; an empty argument is appended first unless the
 *              previous argument was itself closed by a brace;
 *  - a name    appends a named argument, which must be followed by `;`, `{` or `}`.
 * Returns a new frame for a nested `{`, null when the list is finished or on error. */
std::unique_ptr<ParseCall> Bracker_ParseCall::argReadProc(REGEX_IS024_MeaningContext &ctx) {
    while (true) {
        const int32_t lead = peep(ctx); call_ERROR_CHECK;
        if (lead == U';') {
            res.emplace_back();
            readChar(ctx);
            closingBraceEnded = false;
            continue;
        }
        if (lead == U'}') {
            readChar(ctx);
            if (!closingBraceEnded)
                res.emplace_back();
            return NULL;
        }
        if (!is_REGEX024_nameConstituent(lead))
            call_THROW("brace lvl: argument starts with ';' or it's name");

        res.emplace_back();
        res.back().is_empty = false;
        res.back().name = tryRead_REGEX024_name(ctx);
        const int32_t tail = peep(ctx); call_ERROR_CHECK;
        if (tail == U';') {
            readChar(ctx);
            closingBraceEnded = false;
            continue;
        }
        if (tail == U'{')
            return std::make_unique<Bracker_ParseCall>(res.back().arguments);
        if (tail == U'}') {
            readChar(ctx);
            return NULL;
        }
        call_THROW("brace lvl: argument ends with ';' or {...}");
    }
}
|
||||
|
||||
/* Parses one `!command` expression starting at the current input position.
 * Nesting of `{...}` is handled with an explicit stack of ParseCall frames
 * instead of recursion. If an error is pending when a step is about to run,
 * a default-constructed Command is returned. */
Command command_expr_parse(REGEX_IS024_MeaningContext &ctx) {
    std::vector<std::unique_ptr<ParseCall>> callStack;
    Command res;
    callStack.push_back(std::make_unique<Top_ParseCall>(res));
    /* true: the top frame was just pushed; false: a frame above it just finished */
    bool entering = true;
    while (!callStack.empty()){
        if (ctx.error)
            return {};
        ParseCall& top = *callStack.back();
        std::unique_ptr<ParseCall> child = entering ? top.firstTime(ctx) : top.afterReceive(ctx);
        entering = static_cast<bool>(child);
        if (entering)
            callStack.push_back(std::move(child));
        else
            callStack.pop_back();
    }
    return res;
}
|
||||
|
||||
const char* commands_for_codesets[] = {"word", "space", "digit", "variable", "any", "A", NULL};
|
||||
|
||||
bool is_command_for_charset(const Command &cmd) {
|
||||
return !cmd.tilda && cmd.arguments.empty() && is_string_in_stringset(cmd.name.c_str(), commands_for_codesets);
|
||||
}
|
||||
|
||||
void interpret_command_as_charset_giving(const CommonCodesets& cc, const Command &cmd, codeset_t& ret)
|
||||
{
|
||||
if (cmd.name == "word")
|
||||
ret = cc.word_constituents;
|
||||
else if (cmd.name == "space")
|
||||
ret = cc.spaces;
|
||||
else if (cmd.name == "digit")
|
||||
ret = cc.digits;
|
||||
else if (cmd.name == "variable")
|
||||
ret = cc.variable_constituents;
|
||||
else if (cmd.name == "any" || cmd.name == "A")
|
||||
ret = codeset_of_all;
|
||||
else
|
||||
assert(false);
|
||||
}
|
13
src/libregexis024sol/common_codesets.cpp
Normal file
13
src/libregexis024sol/common_codesets.cpp
Normal file
@ -0,0 +1,13 @@
|
||||
#include <libregexis024sol/common_codesets.h>
|
||||
|
||||
/* Builds the predefined codesets:
 *   spaces                = { '\n', ' ', '\t', '\r' }
 *   word_constituents     = a-z, A-Z
 *   digits                = 0-9
 *   variable_constituents = word_constituents + '-' + digits */
CommonCodesets::CommonCodesets() {
    static const char32_t blank_chars[] = {U'\n', U' ', U'\t', U'\r'};
    for (char32_t blank : blank_chars)
        spaces = set_add_char(spaces, blank);

    word_constituents = set_add_range(word_constituents, U'a', U'z');
    word_constituents = set_add_range(word_constituents, U'A', U'Z');

    digits = codeset_t({{'0', '9'}});

    variable_constituents = merge_sets(set_add_char(word_constituents, U'-'), digits);
}
|
14
src/libregexis024sol/common_codesets.h
Normal file
14
src/libregexis024sol/common_codesets.h
Normal file
@ -0,0 +1,14 @@
|
||||
#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024SOL_COMMON_CODESETS_H
|
||||
#define LIBREGEXIS024_SRC_LIBREGEXIS024SOL_COMMON_CODESETS_H
|
||||
|
||||
#include <libregexis024fa/codeset.h>
|
||||
|
||||
struct CommonCodesets {
|
||||
codeset_t spaces;
|
||||
codeset_t word_constituents;
|
||||
codeset_t digits;
|
||||
codeset_t variable_constituents;
|
||||
CommonCodesets();
|
||||
};
|
||||
|
||||
#endif
|
280
src/libregexis024sol/expr_compiler.cpp
Normal file
280
src/libregexis024sol/expr_compiler.cpp
Normal file
@ -0,0 +1,280 @@
|
||||
#include <libregexis024sol/expr_compiler.h>
|
||||
#include <libregexis024vm/vm_opcodes.h>
|
||||
#include <libregexis024vm/utils.h>
|
||||
#include <libregexis024fa/codeset.h>
|
||||
#include <libregexis024fa/finite_automaton.h>
|
||||
#include <libregexis024fa/fa_first_stage_fix.h>
|
||||
#include <libregexis024fa/fa_make_deterministic.h>
|
||||
#include <libregexis024fa/misc_fa_funcs.h>
|
||||
#include <assert.h>
|
||||
#include <memory>
|
||||
#include <libregexis024sol/sol_misc_base.h>
|
||||
#include <libregexis024sol/special_terminals.h>
|
||||
#include <libregexis024sol/subexpr_fa_transformed.h>
|
||||
#include <libregexis024sol/expr_parse_functions/epf.h>
|
||||
#include <libregexis024sol/expr_parse_functions/command_recognition.h>
|
||||
#include <libregexis024fa/graph_to_bytecode/fa_compiler.h>
|
||||
#include <libregexis024sol/common_codesets.h>
|
||||
/* Temporary debug measures */
|
||||
#include <debugging_regexis024/debug_through_graphviz.h>
|
||||
|
||||
#define call_ERROR_CHECK do { if (ctx.error) { return NULL; } } while (0)
|
||||
#define call_THROW(str) do { report(ctx, "regex: " str); return NULL; } while (0)
|
||||
#define aux_ERROR_CHECK do { if (ctx.error) { return; } } while (0)
|
||||
#define aux_THROW(str) do { report(ctx, "regex: " str); return; } while (0)
|
||||
|
||||
/* ****************************** Top */
|
||||
|
||||
const char* dfa_arg_aliases_condone[] = {"forgive", "condone", "okay", "optional", "nonimportant", "ifpossible", NULL};
|
||||
const char* dfa_arg_aliases_acerbic[] = {"acerbic", "angry", "pedantic", "nofork", "pure", "important", "fierce", NULL};
|
||||
|
||||
/* Handles the `!dfa` header command. It may appear only once. Without arguments
 * it just activates determinization; a single plain argument may additionally
 * select the unforgiving mode (dfa_arg_aliases_acerbic) or the non-important
 * mode (dfa_arg_aliases_condone). Everything else is an error. */
void dfa_command_processing(REGEX_IS024_MeaningContext &ctx, ParsingContext& pctx, const Command& cmdBuf){
    if (pctx.dfa_cmd_activated){
        report(ctx, "repeating !dfa command");
        return;
    }
    pctx.dfa_cmd_activated = true;

    const auto& args = cmdBuf.arguments;
    if (args.empty())
        return;

    const bool single_plain_argument = args.size() == 1 && args[0].arguments.empty();
    if (single_plain_argument){
        const char* mode = args[0].name.c_str();
        if (is_string_in_stringset(mode, dfa_arg_aliases_acerbic)) {
            pctx.dfa_cmd_unforgiving = true;
            return;
        }
        if (is_string_in_stringset(mode, dfa_arg_aliases_condone)) {
            pctx.dfa_cmd_nonimportant = true;
            return;
        }
    }
    report(ctx, "wrong arguments in !dfa command");
}
|
||||
|
||||
/* Handles the `!select` header command. It may appear only once. Every argument
 * names a tracking tool that is registered as stored in the selection array.
 * Nested options of an argument:
 *   ca / col        - also store in the collection array (at most once);
 *   min / max / ign - sifting mode (at most one of them); min and max mark the
 *                     tool as used in sifting, min additionally as minimizing. */
void select_command_processing(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, const Command& cmdBuf){
    if (pctx.select_cmd_encountered)
        aux_THROW("repeating !select command");
    pctx.select_cmd_encountered = true;

    for (const CommandArgument& arg: cmdBuf.arguments){
        if (arg.is_empty)
            aux_THROW("wrong arguments in !select command");
        if (ctx.ktr.track_names.count(arg.name) != 0)
            aux_THROW("repeated names in !select command");

        /* Ids are handed out in order of registration */
        const int64_t new_id = static_cast<int64_t>(ctx.ktr.track_names.size());
        ctx.ktr.track_names.insert({arg.name, new_id});
        ctx.ktr.retrieval_info.emplace_back();
        auto& slot = ctx.ktr.retrieval_info.back();
        slot.stored_in_sa = true;
        slot.stored_in_ca = false;

        bool sift_mode_given = false;
        bool collect_given = false;
        for (const CommandArgument& option: arg.arguments){
            const auto& opt = option.name;
            if (opt == "ca" || opt == "col") {
                if (collect_given)
                    aux_THROW("bad argument to !select command");
                collect_given = true;
                slot.stored_in_ca = true;
                continue;
            }
            const bool is_min = (opt == "min");
            const bool is_max = (opt == "max");
            if (!is_min && !is_max && opt != "ign")
                aux_THROW("wrong parameter for prioritized parameter in !select command");
            if (sift_mode_given)
                aux_THROW("bad argument to !select command");
            sift_mode_given = true;
            if (is_min || is_max)
                slot.used_in_sifting = true;
            if (is_min)
                slot.minimizing = true;
        }
        pctx.is_inside_of_these_sa_subexpressions.assign(ctx.ktr.retrieval_info.size(), false);
        /* Other info will be filled once a tracking-unit with such name will be actually found in regex */
    }
}
|
||||
|
||||
/* Consumes the run of header commands at the current position.
 * `hn` is 1 for the header before the regex body and 2 for the one after it;
 * `!select` is accepted only when hn == 1. A `!~` command turns the rest of the
 * input into a comment tail. The first non-header command is left unread
 * (the position is rewound to its start). */
void jump_into_madness(ctx_t& ctx, ParsingContext& pctx, FA_Container &fa, int hn){
    for (;;){
        const int32_t upcoming = peep(ctx); aux_ERROR_CHECK;
        if (upcoming != U'!')
            return;

        const size_t cmd_start = ctx.pos;
        Command cmd = command_expr_parse(ctx); aux_ERROR_CHECK;

        if (cmd.tilda){
            ctx.have_comment_tail = true;
            ctx.comment_tail_start = ctx.pos;
            ctx.pos = ctx.input_size;
            continue;
        }
        if (is_header_dfa_cmd(cmd)){
            dfa_command_processing(ctx, pctx, cmd);
            continue;
        }
        if (is_header_select_cmd(cmd)){
            if (hn != 1)
                aux_THROW("!select command at the wrong place");
            select_command_processing(ctx, pctx, cmd);
            continue;
        }
        /* Not a header command: give it back to the body parser */
        assert(!is_header_cmd(cmd));
        ctx.pos = cmd_start;
        return;
    }
}
|
||||
|
||||
/* Reads the leading header commands, then starts parsing the regex body unless
 * a `!~` command already turned the rest of the input into a comment. */
chekushka TopLvl_ParseCall::firstTime(ctx_t &ctx, ParsingContext &pctx, FA_Container &fa) {
    result.assertDefault();
    jump_into_madness(ctx, pctx, fa, 1);
    if (!ctx.have_comment_tail)
        return std::make_unique<ForkLvl_ParseCall>(result);
    return NULL;
}
|
||||
|
||||
/* The body has been parsed: read the trailing header commands and make sure
 * nothing but the end of input follows. */
chekushka TopLvl_ParseCall::afterReceive(ctx_t &ctx, ParsingContext &pctx, FA_Container &fa) {
    jump_into_madness(ctx, pctx, fa, 2);
    if (isEnd(ctx))
        return NULL;
    call_THROW("top lvl: EOF expected");
}
|
||||
|
||||
/* ********************************* Bracket */
|
||||
|
||||
/* Consumes the opening `(` and starts parsing the bracket contents into
 * tmp_ret_buff. For a named subexpression tracked by the selection array the
 * "currently inside" flag is raised; nesting such a subexpression in itself is
 * an error. */
chekushka BracketLvl_ParseCall::firstTime(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa) {
    result.assertDefault();
    /* The read must not live inside assert(): with NDEBUG it would vanish
     * and the '(' would never be consumed */
    const auto opening = readChar(ctx);
    assert(opening == U'(');
    (void)opening;
    /* sequence lvl already took care about resolving name and configuring SubtrackingNameInfo */
    if (namedSubexpressionId >= 0){
        assert(ctx.ktr.retrieval_info[namedSubexpressionId].type == tracking_var_types::range);
        if (ctx.ktr.retrieval_info[namedSubexpressionId].stored_in_sa){
            assert(namedSubexpressionId < (int64_t)pctx.is_inside_of_these_sa_subexpressions.size());
            if (pctx.is_inside_of_these_sa_subexpressions[namedSubexpressionId])
                call_THROW("subexpression that selection array tracks is nested");
            pctx.is_inside_of_these_sa_subexpressions[namedSubexpressionId] = true;
        }
    }
    return std::make_unique<ForkLvl_ParseCall>(tmp_ret_buff);
}
|
||||
|
||||
/* Consumes the closing `)` and publishes the parsed contents as the result.
 * For a named subexpression the result is wrapped into a pair of tracking-array
 * moves (one in front, one behind): first for the collection array, then for
 * the selection array, depending on where the name is stored. */
chekushka BracketLvl_ParseCall::afterReceive(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa) {
    if (peep(ctx) != U')')
        call_THROW("missing ')'");
    readChar(ctx);
    result = tmp_ret_buff;
    if (namedSubexpressionId < 0)
        return NULL;

    /* Prepends a move to `first_slot` and appends a move to `second_slot` */
    auto surround_with_moves = [&](auto opcode, auto first_slot, auto second_slot) {
        result = join(subexpression_from_path(fa.makeTrackArrayMovHalfinvariant(opcode, first_slot)), result);
        result = join(result, subexpression_from_path(fa.makeTrackArrayMovHalfinvariant(opcode, second_slot)));
    };

    SubtrackingNameInfo& tai_slots = ctx.ktr.retrieval_info[namedSubexpressionId];
    if (tai_slots.stored_in_ca){
        assert(tai_slots.colarr_first >= 0 && tai_slots.colarr_first < UINT16_MAX);
        assert(tai_slots.colarr_second >= 0 && tai_slots.colarr_second < UINT16_MAX);
        surround_with_moves(regex024_opcodes::MOV_COLARR_BTPOS, tai_slots.colarr_first, tai_slots.colarr_second);
    }
    if (tai_slots.stored_in_sa){
        assert(tai_slots.selarr_first >= 0 && tai_slots.selarr_first < UINT16_MAX);
        assert(tai_slots.selarr_second >= 0 && tai_slots.selarr_second < UINT16_MAX);
        surround_with_moves(regex024_opcodes::MOV_SELARR_CHPOS, tai_slots.selarr_first, tai_slots.selarr_second);
        /* We are leaving the tracked subexpression */
        pctx.is_inside_of_these_sa_subexpressions[namedSubexpressionId] = false;
    }
    return NULL;
}
|
||||
|
||||
/* ******************************* Fork */
|
||||
|
||||
/* Opens a new alternative: appends an empty slot to `options` and lets a
 * Sequence_ParseCall fill it. Also re-entered from afterReceive after each `|`. */
chekushka ForkLvl_ParseCall::firstTime(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa) {
    result.assertDefault();
    /* The fresh element is empty for now; the sequence parser overwrites it */
    options.emplace_back();
    auto& current_option = options.back();
    return std::make_unique<Sequence_ParseCall>(current_option);
}
|
||||
|
||||
/* An alternative has been parsed. A following `|` starts the next one;
 * anything else ends the fork and all collected options are combined. */
chekushka ForkLvl_ParseCall::afterReceive(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa) {
    const int32_t stop_char = peep(ctx); call_ERROR_CHECK;
    if (stop_char != U'|'){
        result = forkify(options, fa);
        return NULL;
    }
    readChar(ctx);
    return firstTime(ctx, pctx, fa);
}
|
||||
|
||||
/* Parses the whole regular expression into `result`, driving an explicit stack
 * of ParseCall frames, and afterwards fills pctx.priority_table (the sifting
 * program) from the tracking tools that take part in sifting.
 * Stops silently as soon as an error is pending in `ctx`. */
void parseBody(REGEX_IS024_MeaningContext& ctx, FA_Container& fa, SubExprCompiled& result, ParsingContext& pctx){
    std::vector<std::shared_ptr<ParseCall>> callStack;
    callStack.push_back(std::make_unique<TopLvl_ParseCall>(result));
    /* true: the top frame was just pushed; false: a frame above it just finished */
    bool entering = true;
    while (!callStack.empty()){
        aux_ERROR_CHECK;
        ParseCall& top = *callStack.back();
        auto child = entering ? top.firstTime(ctx, pctx, fa) : top.afterReceive(ctx, pctx, fa);
        entering = static_cast<bool>(child);
        if (entering)
            callStack.push_back(std::move(child));
        else
            callStack.pop_back();
    }
    /* Generating priority table (sifting program) */
    for (const SubtrackingNameInfo& sni: ctx.ktr.retrieval_info) {
        if (!sni.discovered)
            aux_THROW("tracking tool named in !select is not used anywhere");
        if (!sni.used_in_sifting)
            continue;
        assert(sni.selarr_first >= 0);
        /* Only range variables occupy a second selection-array slot */
        assert((sni.type == tracking_var_types::range) == (sni.selarr_second != -1));
        pctx.priority_table.emplace_back(sni.minimizing, sni.selarr_first, sni.selarr_second, sni.type);
    }
}
|
||||
|
||||
/* Compiles the regular expression `input` of `inputSize` bytes.
 * Pipeline: parse into an automaton -> attach the match node -> first stage fix
 * -> determinization (when `!dfa` was given) or the regular second stage fix
 * -> bytecode in `compiled_program`.
 * On failure `error` is set (through report) and construction stops early. */
REGEX_IS024_MeaningContext::REGEX_IS024_MeaningContext(size_t inputSize, const char *input) : input_size(inputSize),
        input(reinterpret_cast<const uint8_t *>(input)) {
    CommonCodesets codeset_collection;
    FA_Container fa;
    FA_Container fa_1f;
    FA_Container fa_2f;
    SubExprCompiled result;
    ParsingContext pctx(codeset_collection);
    parseBody(*this, fa, result, pctx);
    /* `error` is set inside report(), which is reached through macros; static
     * analyzers may wrongly consider this condition always false */
    if (error)
        return;

    FA_NodeOfMatch* matcher = fa.makeMatch();
    if (!result.start){
        /* Empty expression: the automaton consists of the match node only */
        fa.start = matcher;
    } else {
        fa.start = result.start;
        for (FA_Node** ending: result.ends)
            reattach_fa_node_edge(ending, matcher);
    }
    fa.start->refs++;

    // show_fa_with_sxiv_after_dot(fa, ktr, pctx.priority_table); // todo debug

    REGEX_IS024_FA_FirstStageFixInfo info1 = first_stage_fix_fa(fa, fa_1f);

    // show_fa_with_sxiv_after_dot(fa_1f, ktr, pctx.priority_table); // todo debug

    if (pctx.dfa_cmd_activated) {
        /* Zero-initialized so the checks below never read indeterminate values */
        int det_err = 0;
        int had_to_fork = 0;
        try_determinize_fa(fa_1f, pctx.priority_table, free_selarr_tai, info1, fa_2f, det_err, had_to_fork);
        if (det_err < 0 && !pctx.dfa_cmd_nonimportant) {
            report(*this, "Unable to determinize dfa");
            return;
        }
        if (pctx.dfa_cmd_unforgiving && had_to_fork < 0) {
            report(*this, "Attempt to determinize dfa was not good enough");
            return;
        }
    } else {
        regular_second_stage_fix(fa_1f, fa_2f, info1);
    }

    // show_fa_with_sxiv_after_dot(fa_2f, ktr, pctx.priority_table); // todo debug

    int compilation_error = 0;
    compile_fa_to_regexis024_bytecode(compiled_program, fa_2f, pctx.priority_table, free_selarr_tai, info1, compilation_error);
    if (compilation_error) {
        report(*this, "Failed to compile graph representation to bytecode representation");
        return;
    }
}
|
34
src/libregexis024sol/expr_compiler.h
Normal file
34
src/libregexis024sol/expr_compiler.h
Normal file
@ -0,0 +1,34 @@
|
||||
#ifndef LIBREGEXIS024_EXPR_COMPILER_H
|
||||
#define LIBREGEXIS024_EXPR_COMPILER_H
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <stdint.h>
|
||||
|
||||
|
||||
// todo: SUPER HIGH PRIORITY: MOVE all this spaces digits variable_constituents junk out of this class
|
||||
// todo: also PLEEEASE, write static before literally nearly every single one little stupid function in this library
|
||||
#include <libregexis024sol/part_of_expr_that_tracks.h>
|
||||
|
||||
struct REGEX_IS024_MeaningContext{
|
||||
size_t input_size;
|
||||
const uint8_t* input;
|
||||
|
||||
bool error = false;
|
||||
std::string error_msg;
|
||||
|
||||
size_t pos = 0;
|
||||
|
||||
bool have_comment_tail = false;
|
||||
size_t comment_tail_start;
|
||||
std::vector<uint8_t> compiled_program;
|
||||
|
||||
KnownTrackingTools ktr;
|
||||
|
||||
uint16_t free_selarr_tai = 0;
|
||||
uint16_t free_colarr_tai = 0;
|
||||
|
||||
REGEX_IS024_MeaningContext(size_t inputSize, const char *input);
|
||||
};
|
||||
|
||||
#endif //LIBREGEXIS024_EXPR_COMPILER_H
|
@ -0,0 +1,34 @@
|
||||
#include <libregexis024sol/expr_parse_functions/command_recognition.h>
|
||||
#include <libregexis024vm/utils.h>
|
||||
#include <libregexis024sol/expr_compiler.h>
|
||||
#include <libregexis024sol/sol_misc_base.h>
|
||||
|
||||
#define aux_ERROR_CHECK do { if (ctx.error) { return; } } while (0)
|
||||
#define aux_THROW(str) do { report(ctx, "regex: " str); return; } while (0)
|
||||
|
||||
const char* header_command_dfa_names[] = {"dfa", "determinize", NULL};
|
||||
|
||||
const char* header_command_select_names[] = {"s", "select", "selarr", "selectional", NULL};
|
||||
|
||||
bool is_header_cmd(const Command &cmd) {
|
||||
return cmd.tilda || is_header_dfa_cmd(cmd), is_header_dfa_cmd(cmd);
|
||||
}
|
||||
|
||||
bool is_header_dfa_cmd(const Command &cmd) {
|
||||
return is_string_in_stringset(cmd.name.c_str(), header_command_dfa_names);
|
||||
}
|
||||
|
||||
bool is_header_select_cmd(const Command &cmd) {
|
||||
return is_string_in_stringset(cmd.name.c_str(), header_command_select_names);
|
||||
}
|
||||
|
||||
void int_parse_with_limit_concern(const std::string &str, REGEX_IS024_MeaningContext &ctx, size_t &res, int lim) {
|
||||
res = 0;
|
||||
for (char ch: str){
|
||||
if (!('0' <= ch && ch <= '9'))
|
||||
aux_THROW("bad integer argument");
|
||||
res = res * 10 + (ch - '0');
|
||||
if (res > (size_t)lim)
|
||||
aux_THROW("integer is too big");
|
||||
}
|
||||
}
|
@ -0,0 +1,13 @@
|
||||
/* Internal use only */
|
||||
#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024SOL_EXPR_PARSE_FUNCTIONS_COMMAND_RECOGNITION_H
|
||||
#define LIBREGEXIS024_SRC_LIBREGEXIS024SOL_EXPR_PARSE_FUNCTIONS_COMMAND_RECOGNITION_H
|
||||
|
||||
#include <libregexis024sol/special_terminals.h>
|
||||
|
||||
bool is_header_cmd(const Command& cmd);
|
||||
bool is_header_dfa_cmd(const Command& cmd);
|
||||
bool is_header_select_cmd(const Command& cmd);
|
||||
void int_parse_with_limit_concern(const std::string& str, REGEX_IS024_MeaningContext &ctx, size_t& res, int lim);
|
||||
|
||||
|
||||
#endif //LIBREGEXIS024_SRC_LIBREGEXIS024SOL_EXPR_PARSE_FUNCTIONS_COMMAND_RECOGNITION_H
|
222
src/libregexis024sol/expr_parse_functions/ep_sequence.cpp
Normal file
222
src/libregexis024sol/expr_parse_functions/ep_sequence.cpp
Normal file
@ -0,0 +1,222 @@
|
||||
#include <libregexis024sol/expr_parse_functions/epf.h>
|
||||
#include <assert.h>
|
||||
#include <libregexis024sol/expr_parse_functions/tracking_units.h>
|
||||
#include <libregexis024sol/sol_misc_base.h>
|
||||
#include <libregexis024sol/expr_compiler.h>
|
||||
#include <libregexis024sol/special_terminals.h>
|
||||
#include <libregexis024vm/vm_opcodes.h>
|
||||
#include <libregexis024sol/square_bracket_expression.h>
|
||||
#include <libregexis024sol/expr_parse_functions/command_recognition.h>
|
||||
#include <libregexis024fa/misc_fa_funcs.h>
|
||||
|
||||
#define call_ERROR_CHECK do { if (ctx.error) { return NULL; } } while (0)
|
||||
#define call_THROW(str) do { report(ctx, "regex: " str); return NULL; } while (0)
|
||||
#define aux_ERROR_CHECK do { if (ctx.error) { return; } } while (0)
|
||||
#define aux_THROW(str) do { report(ctx, "regex: " str); return; } while (0)
|
||||
|
||||
/* **************************** Sequence */
|
||||
|
||||
/* Parses a backslash expression inside a sequence and stores the resulting
 * sub-automaton in `backPart`. Expects the input to be positioned at `\`.
 *   \b  word boundary      (non-word|word or word|non-word)
 *   \B  not a boundary     (word|word or non-word|non-word)
 *   \<  beginning of word  (non-word|word)
 *   \>  end of word        (word|non-word)
 * Every other leader is handled by backslash_expression_parsing_try_regular. */
void in_case_of_backslash(REGEX_IS024_MeaningContext &ctx, const CommonCodesets& cc, FA_Container &fa, SubExprCompiled& backPart) {
    /* The read must not live inside assert(): with NDEBUG it would vanish
     * and the backslash would never be consumed */
    const auto backslash = readChar(ctx);
    assert(backslash == U'\\');
    (void)backslash;
    int32_t leader = peep(ctx); aux_ERROR_CHECK;
    if (leader == U'b'){
        FA_NodeOfForking* n1 = fa.makeForking();
        /* Option a: non-word behind, word ahead */
        FA_NodeOfLookOneBehind* n1a = fa.makeLookOneBehind(invert_set(cc.word_constituents));
        FA_NodeOfLookOneAhead* n2a = fa.makeLookOneAhead(cc.word_constituents);
        reattach_nxt_node(n1a, n2a);
        /* Option b: word behind, non-word ahead */
        FA_NodeOfLookOneBehind* n1b = fa.makeLookOneBehind(cc.word_constituents);
        FA_NodeOfLookOneAhead* n2b = fa.makeLookOneAhead(invert_set(cc.word_constituents));
        reattach_nxt_node(n1b, n2b);
        add_option_to_fork_node(n1, n1a);
        add_option_to_fork_node(n1, n1b);
        backPart.start = n1;
        backPart.ends = {&(n2a->nxt_node), &(n2b->nxt_node)};
    } else if (leader == U'B'){
        FA_NodeOfForking* n1 = fa.makeForking();
        /* Option a: word on both sides */
        FA_NodeOfLookOneBehind* n1a = fa.makeLookOneBehind(cc.word_constituents);
        FA_NodeOfLookOneAhead* n2a = fa.makeLookOneAhead(cc.word_constituents);
        reattach_nxt_node(n1a, n2a);
        /* Option b: non-word on both sides */
        FA_NodeOfLookOneBehind* n1b = fa.makeLookOneBehind(invert_set(cc.word_constituents));
        FA_NodeOfLookOneAhead* n2b = fa.makeLookOneAhead(invert_set(cc.word_constituents));
        reattach_nxt_node(n1b, n2b);
        add_option_to_fork_node(n1, n1a);
        add_option_to_fork_node(n1, n1b);
        backPart.start = n1;
        backPart.ends = {&(n2a->nxt_node), &(n2b->nxt_node)};
    } else if (leader == U'<'){
        FA_NodeOfLookOneBehind *n1 = fa.makeLookOneBehind(invert_set(cc.word_constituents));
        FA_NodeOfLookOneAhead *n2 = fa.makeLookOneAhead(cc.word_constituents);
        reattach_nxt_node(n1, n2);
        backPart.start = n1;
        backPart.ends = {&(n2->nxt_node)};
    } else if (leader == U'>'){
        FA_NodeOfLookOneBehind *n1 = fa.makeLookOneBehind(cc.word_constituents);
        FA_NodeOfLookOneAhead *n2 = fa.makeLookOneAhead(invert_set(cc.word_constituents));
        reattach_nxt_node(n1, n2);
        backPart.start = n1;
        backPart.ends = {&(n2->nxt_node)};
    } else {
        bool ret_is_multicode; codeset_t res_codeset;
        backslash_expression_parsing_try_regular(ctx, cc, ret_is_multicode, res_codeset);
        backPart = subexpr_charset_reading_filter(res_codeset, fa);
        return; // To avoid reading leader again (it gets read in the end)
    }
    /* Consume the leader of the \b \B \< \> forms */
    readChar(ctx);
}
|
||||
|
||||
/* Applies a repetition [min_allowed, max_allowed] to `patient` after validating
 * the bounds. A max above REGEXIS024_MAX_REPEAT stands for "unbounded", which is
 * refused for subexpressions that can match the empty string. */
void repeat_stuff_with_check(REGEX_IS024_MeaningContext& ctx,
                             SubExprCompiled &patient, FA_Container& fa, size_t min_allowed, size_t max_allowed){
    if (min_allowed > max_allowed)
        aux_THROW("repeat operation: min > max");
    if (min_allowed > REGEXIS024_MAX_REPEAT)
        aux_THROW("minimum repeat factor is too high");
    const bool unbounded = max_allowed > REGEXIS024_MAX_REPEAT;
    if (unbounded && patient.can_be_empty)
        aux_THROW("safety abortion: possible infinite loop. Если вы считаете, что ваше регулярное "
                  "выражение корректно и не вызвает бесконечного цикла, напишите об этом в жалобную книгу: "
                  "По ссылке: file:///dev/null Ваши предложения по улучшению libregexis024 обязательно будут рассмотрены.");
    apply_repeat_to_subexpression(patient, fa, min_allowed, max_allowed);
}
|
||||
|
||||
/* Handles `!repeat` / `!r`, applied to the latest parsed subexpression:
 *   no arguments or one empty argument - zero or more times;
 *   one number N                       - exactly N times;
 *   two arguments {min;max}            - an empty min means 0, an empty max
 *                                        means unbounded.
 * More than two arguments, bad numbers or min > max are reported as errors. */
void repeat_command_processing(REGEX_IS024_MeaningContext &ctx, FA_Container &fa, std::vector<SubExprCompiled>& parts,
                               const Command& cmd){
    if (parts.empty())
        aux_THROW("no subexpression before !repeat command");

    const auto& args = cmd.arguments;
    /* Anything above REGEXIS024_MAX_REPEAT is treated as "no upper bound" */
    const size_t unbounded = REGEXIS024_MAX_REPEAT + 1;

    if (args.empty() || (args.size() == 1 && args[0].is_empty)) {
        repeat_stuff_with_check(ctx, parts.back(), fa, 0, unbounded);
        return;
    }
    if (args.size() == 1){
        size_t exact;
        int_parse_with_limit_concern(args[0].name, ctx, exact, REGEXIS024_MAX_REPEAT); aux_ERROR_CHECK;
        repeat_stuff_with_check(ctx, parts.back(), fa, exact, exact);
        return;
    }
    if (args.size() > 2)
        aux_THROW("too many arguments in !repeat command");

    size_t min_allowed = 0;
    size_t max_allowed = unbounded;
    if (!args[0].is_empty){
        int_parse_with_limit_concern(args[0].name, ctx, min_allowed, REGEXIS024_MAX_REPEAT);
        aux_ERROR_CHECK;
    }
    if (!args[1].is_empty){
        int_parse_with_limit_concern(args[1].name, ctx, max_allowed, REGEXIS024_MAX_REPEAT);
        aux_ERROR_CHECK;
    }
    if (min_allowed > max_allowed)
        aux_THROW("!repeat: min > max");
    repeat_stuff_with_check(ctx, parts.back(), fa, min_allowed, max_allowed);
}
|
||||
|
||||
|
||||
/* Parses consecutive items of a sequence, starting at ctx.pos. Every parsed item is appended
 * to `parts`. The loop stops at end of input, at `)`, `|`, `]`, or at a header command
 * (in which case ctx.pos is rewound to the start of that command).
 *
 * Returns a child ParseCall when a bracket expression (`(...)` or `#name(...)`) has to be
 * parsed first; the child writes into a freshly appended element of `parts`.
 * Returns null when the sequence is finished (all parts joined into `result`) or when an
 * error was reported into ctx. */
chekushka Sequence_ParseCall::firstTime(REGEX_IS024_MeaningContext &ctx, ParsingContext &pctx, FA_Container &fa) {
    while (true) {
        int32_t fst = peep(ctx);
        call_ERROR_CHECK;
        if (fst == U'!') {
            /* Remember the position: a header command does not belong to the sequence */
            size_t before_cmd = ctx.pos;
            Command cmdBuf = command_expr_parse(ctx);
            call_ERROR_CHECK;
            if (is_header_cmd(cmdBuf)){
                ctx.pos = before_cmd;
                break;
            } else if (cmdBuf.name == "r" || cmdBuf.name == "repeat"){
                repeat_command_processing(ctx, fa, parts, cmdBuf); call_ERROR_CHECK;
            } else if (is_command_for_charset(cmdBuf)){
                codeset_t cs;
                interpret_command_as_charset_giving(pctx.cc, cmdBuf, cs); call_ERROR_CHECK;
                parts.push_back(subexpr_charset_reading_filter(cs, fa));
            } else {
                call_THROW("unknown command");
            }
        } else if (fst == U'\\') {
            parts.emplace_back();
            in_case_of_backslash(ctx, pctx.cc, fa, parts.back());
            call_ERROR_CHECK;
        } else if (fst == U'^'){
            /* Look one character behind for '\n' */
            readChar(ctx);
            parts.push_back(subexpression_from_path(fa.makeLookOneBehind(codeset_of_one_char(U'\n'))));
        } else if (fst == U'$'){
            /* Look one character ahead for '\n' */
            readChar(ctx);
            parts.push_back(subexpression_from_path(fa.makeLookOneAhead(codeset_of_one_char(U'\n'))));
        } else if (fst == U'*'){
/* Postfix operators need a preceding part to act on; the operator itself is then consumed */
#define vibe_check(sn) if (parts.empty()) { call_THROW("no subexpression before `" sn "` operator"); } readChar(ctx);
            vibe_check("*")
            repeat_stuff_with_check(ctx, parts.back(), fa, 0, REGEXIS024_MAX_REPEAT + 1); call_ERROR_CHECK;
        } else if (fst == U'+'){
            vibe_check("+")
            repeat_stuff_with_check(ctx, parts.back(), fa, 1, REGEXIS024_MAX_REPEAT + 1); call_ERROR_CHECK;
        } else if (fst == U'?'){
            vibe_check("?")
            repeat_stuff_with_check(ctx, parts.back(), fa, 0, 1); call_ERROR_CHECK;
#undef vibe_check
        } else if (fst == U'#'){
            /* Tracking unit: #name( ... ), #name:value; or #name; */
            readChar(ctx);
            std::string name = tryRead_REGEX024_name(ctx); call_ERROR_CHECK;
            if (name.empty())
                call_THROW("No name provided after #");
            /* First mention of the name registers a new entry in retrieval_info */
            if (ctx.ktr.track_names.count(name) == 0){
                ctx.ktr.track_names[name] = static_cast<int64_t>(ctx.ktr.retrieval_info.size());
                ctx.ktr.retrieval_info.emplace_back();
            }
            int64_t id = ctx.ktr.track_names[name];
            int32_t typeDet = peep(ctx); call_ERROR_CHECK;
            if (typeDet == U'('){
                ensure_space_for_track_unit(ctx, name, tracking_var_types::range); call_ERROR_CHECK;
                parts.emplace_back();
                return std::make_unique<BracketLvl_ParseCall>(parts.back(), id);
            } else if (typeDet == U':'){
                ensure_space_for_track_unit(ctx, name, tracking_var_types::dot_immediate); call_ERROR_CHECK;
                readChar(ctx);
                std::string value_str = tryRead_REGEX024_name(ctx); call_ERROR_CHECK;
                size_t value = 0;
                int_parse_with_limit_concern(value_str, ctx, value, UINT16_MAX);
                /* Without this check a failed parse would put an unspecified `value` into the automaton */
                call_ERROR_CHECK;
                int32_t cl = peep(ctx); call_ERROR_CHECK;
                if (cl != U';')
                    call_THROW("Missing ; after dot track unit operator");
                readChar(ctx);
                if (ctx.ktr.retrieval_info[id].stored_in_sa)
                    parts.emplace_back(subexpression_from_path(
                        fa.makeTrackArrayMovImm(regex024_opcodes::MOV_SELARR_IMM,
                            ctx.ktr.retrieval_info[id].selarr_first, value)));
                if (ctx.ktr.retrieval_info[id].stored_in_ca)
                    parts.emplace_back(subexpression_from_path(
                        fa.makeTrackArrayMovImm(regex024_opcodes::MOV_COLARR_IMM,
                            ctx.ktr.retrieval_info[id].colarr_first, value)));
            } else if (typeDet == U';'){
                ensure_space_for_track_unit(ctx, name, tracking_var_types::dot_cur_pos); call_ERROR_CHECK;
                readChar(ctx);
                if (ctx.ktr.retrieval_info[id].stored_in_sa)
                    parts.emplace_back(subexpression_from_path(
                        fa.makeTrackArrayMovHalfinvariant(regex024_opcodes::MOV_SELARR_CHPOS,
                            ctx.ktr.retrieval_info[id].selarr_first)));
                if (ctx.ktr.retrieval_info[id].stored_in_ca)
                    parts.emplace_back(subexpression_from_path(
                        fa.makeTrackArrayMovHalfinvariant(regex024_opcodes::MOV_COLARR_BTPOS,
                            ctx.ktr.retrieval_info[id].colarr_first)));
            } else
                call_THROW("Missing ; or ( in the beginning of tracking unit");
        } else if (fst == U'(') {
            /* Plain (unnamed) bracket expression: -1 means "no tracking unit" */
            parts.emplace_back();
            return std::make_unique<BracketLvl_ParseCall>(parts.back(), -1);
        } else if (fst == U'[') {
            codeset_t filter = sq_bracket_expr_parse(ctx, pctx.cc); call_ERROR_CHECK;
            parts.push_back(subexpr_charset_reading_filter(filter, fa));
        } else if (fst >= 0 && fst != U')' && fst != U'|' && fst != U']'){
            /* Any other character matches itself */
            readChar(ctx);
            parts.push_back(subexpr_charset_reading_filter(codeset_of_one_char(fst), fa));
        } else {
            break;
        }
    }
    for (SubExprCompiled& part: parts)
        result = join(result, part);
    return nullptr;
}
|
||||
|
||||
/* Resumes the sequence after a child call has finished. firstTime() only returns a child
 * for bracket expressions, whose result is already stored in parts.back(), so parsing
 * simply continues with the next item. */
chekushka Sequence_ParseCall::afterReceive(REGEX_IS024_MeaningContext &ctx, ParsingContext &pctx, FA_Container &fa) {
    chekushka next_call = firstTime(ctx, pctx, fa);
    return next_call;
}
|
74
src/libregexis024sol/expr_parse_functions/epf.h
Normal file
74
src/libregexis024sol/expr_parse_functions/epf.h
Normal file
@ -0,0 +1,74 @@
|
||||
#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024SOL_EXPR_PARSE_FUNCTIONS_EPF_H
|
||||
#define LIBREGEXIS024_SRC_LIBREGEXIS024SOL_EXPR_PARSE_FUNCTIONS_EPF_H
|
||||
/* For internal usage only */
|
||||
|
||||
#include <libregexis024sol/expr_compiler.h>
|
||||
#include <libregexis024sol/common_codesets.h>
|
||||
#include <libregexis024fa/finite_automaton.h>
|
||||
#include <memory>
|
||||
#include <libregexis024sol/subexpr_fa_transformed.h>
|
||||
#include <assert.h>
|
||||
#include <libregexis024fa/selarr_priority_table.h>
|
||||
|
||||
struct ParsingContext{
|
||||
/* Those subexpressions, that are tracket by s`a are forbidden from nesting inside themselves */
|
||||
std::vector<bool> is_inside_of_these_sa_subexpressions;
|
||||
bool select_cmd_encountered = false;
|
||||
RegexPriorityTable priority_table;
|
||||
bool dfa_cmd_activated = false;
|
||||
/* Completely failing to build dfa with this flag on will result in no error */
|
||||
bool dfa_cmd_nonimportant = false;
|
||||
/* With this flag, your dfa should be absolutely pure, no forks are allowed. */
|
||||
bool dfa_cmd_unforgiving = false;
|
||||
|
||||
/* Reference to active cc set (actually, there is only one cc, but who cares, I placed
|
||||
* it here to lower the number of arguments in ParseCall methods, again WHO CARES?) */
|
||||
const CommonCodesets& cc;
|
||||
explicit ParsingContext(const CommonCodesets& cc_): cc(cc_){}
|
||||
};
|
||||
|
||||
typedef REGEX_IS024_MeaningContext ctx_t;
|
||||
struct ParseCall;
|
||||
typedef std::unique_ptr<ParseCall> chekushka;
|
||||
struct ParseCall{
|
||||
SubExprCompiled& result;
|
||||
explicit ParseCall(SubExprCompiled &result) : result(result) {}
|
||||
virtual ~ParseCall() = default;
|
||||
virtual chekushka afterReceive(ctx_t& ctx, ParsingContext& pctx, FA_Container& fa) { assert(false); }
|
||||
virtual chekushka firstTime(ctx_t& ctx, ParsingContext& pctx, FA_Container& fa) { assert(false); }
|
||||
};
|
||||
|
||||
struct TopLvl_ParseCall: public ParseCall{
|
||||
explicit TopLvl_ParseCall(SubExprCompiled &result) : ParseCall(result) {}
|
||||
chekushka afterReceive(ctx_t &ctx, ParsingContext &pctx, FA_Container &fa) override;
|
||||
chekushka firstTime(ctx_t &ctx, ParsingContext &pctx, FA_Container &fa) override;
|
||||
};
|
||||
|
||||
struct BracketLvl_ParseCall: public ParseCall{
|
||||
/* -1 if this is a normal bracket expression. Otherwise, it is an index in ctx.retrieval_info vector */
|
||||
int64_t namedSubexpressionId;
|
||||
SubExprCompiled tmp_ret_buff;
|
||||
explicit BracketLvl_ParseCall(SubExprCompiled& result, int64_t namedSubexpressionId) :
|
||||
ParseCall(result), namedSubexpressionId(namedSubexpressionId) {}
|
||||
chekushka afterReceive(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa) override;
|
||||
chekushka firstTime(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa) override;
|
||||
};
|
||||
|
||||
struct ForkLvl_ParseCall: public ParseCall{
|
||||
std::vector<SubExprCompiled> options;
|
||||
explicit ForkLvl_ParseCall(SubExprCompiled &result) : ParseCall(result) {}
|
||||
chekushka afterReceive(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa);
|
||||
chekushka firstTime(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa);
|
||||
};
|
||||
|
||||
struct Sequence_ParseCall: public ParseCall{
|
||||
std::vector<SubExprCompiled> parts;
|
||||
explicit Sequence_ParseCall(SubExprCompiled &result) :ParseCall(result) {}
|
||||
chekushka afterReceive(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa);
|
||||
chekushka firstTime(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa);
|
||||
};
|
||||
|
||||
/* Some auxilary functions */
|
||||
|
||||
|
||||
#endif //LIBREGEXIS024_SRC_LIBREGEXIS024SOL_EXPR_PARSE_FUNCTIONS_EPF_H
|
38
src/libregexis024sol/expr_parse_functions/tracking_units.cpp
Normal file
38
src/libregexis024sol/expr_parse_functions/tracking_units.cpp
Normal file
@ -0,0 +1,38 @@
|
||||
#include <libregexis024sol/expr_parse_functions/tracking_units.h>
|
||||
#include <libregexis024sol/sol_misc_base.h>
|
||||
|
||||
#define aux_THROW(str) do { report(ctx, "regex: " str); return; } while (0)
|
||||
#define aux_ERROR_CHECK do { if (ctx.error) { return; } } while (0)
|
||||
|
||||
|
||||
/* Reserves keys of one tracking array for a tracking unit.
 *
 * free_ARR_tai  next free key of the array; advanced for every reserved key
 * ARR_first     receives the first reserved key
 * ARR_second    receives the second key, reserved only for units of type `range`
 * ARR_NAME      human readable array name, used in the error message
 *
 * If the key namespace is exhausted (free_ARR_tai == UINT16_MAX) an error is reported
 * into ctx and the function returns early. */
void for_one_type(REGEX_IS024_MeaningContext &ctx, uint16_t& free_ARR_tai, int& ARR_first, int& ARR_second,
                  const std::string& ARR_NAME, tracking_var_type type){
    /* A lambda instead of a function-like macro: nothing leaks into the rest of the
     * translation unit and the early returns stay visible below */
    auto key_is_available = [&]() -> bool {
        if (free_ARR_tai == UINT16_MAX) {
            report(ctx, ("regex: " + ARR_NAME + ": key namespace overflow").c_str());
            return false;
        }
        return true;
    };

    if (!key_is_available())
        return;
    ARR_first = free_ARR_tai++;
    if (type == tracking_var_types::range){
        if (!key_is_available())
            return;
        ARR_second = free_ARR_tai++;
    }
}
|
||||
|
||||
/* Makes sure the tracking unit `name` has its array keys reserved.
 * On the first call for a name the unit gets `type` and keys in every array it is stored
 * in. On later calls only the type is compared: a different type is reported as an error.
 * The name is looked up with operator[], so it is expected to be registered already. */
void ensure_space_for_track_unit(REGEX_IS024_MeaningContext &ctx, const std::string& name, tracking_var_type type) {
    const size_t unit_index = ctx.ktr.track_names[name];
    /* The vector is not resized in this function, so the reference stays valid */
    SubtrackingNameInfo& unit = ctx.ktr.retrieval_info[unit_index];

    if (unit.discovered) {
        if (unit.type != type)
            aux_THROW("tracking tool unit type mismatch");
        return;
    }

    unit.type = type;
    if (unit.stored_in_ca) {
        for_one_type(ctx, ctx.free_colarr_tai, unit.colarr_first, unit.colarr_second, "collection array", type);
        aux_ERROR_CHECK;
    }
    if (unit.stored_in_sa) {
        for_one_type(ctx, ctx.free_selarr_tai, unit.selarr_first, unit.selarr_second, "selection array", type);
        aux_ERROR_CHECK;
    }
    /* Marked only after every reservation succeeded */
    unit.discovered = true;
}
|
10
src/libregexis024sol/expr_parse_functions/tracking_units.h
Normal file
10
src/libregexis024sol/expr_parse_functions/tracking_units.h
Normal file
@ -0,0 +1,10 @@
|
||||
/* For internal use only */
|
||||
#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024SOL_TRACKING_UNITS_H
|
||||
#define LIBREGEXIS024_SRC_LIBREGEXIS024SOL_TRACKING_UNITS_H
|
||||
|
||||
#include <libregexis024sol/expr_compiler.h>
|
||||
|
||||
void ensure_space_for_track_unit(REGEX_IS024_MeaningContext &ctx, const std::string& name, tracking_var_type type);
|
||||
|
||||
|
||||
#endif //LIBREGEXIS024_SRC_LIBREGEXIS024SOL_TRACKING_UNITS_H
|
2
src/libregexis024sol/part_of_expr_that_tracks.cpp
Normal file
2
src/libregexis024sol/part_of_expr_that_tracks.cpp
Normal file
@ -0,0 +1,2 @@
|
||||
// #include <libregexis024sol/part_of_expr_that_tracks.h>
|
||||
|
31
src/libregexis024sol/part_of_expr_that_tracks.h
Normal file
31
src/libregexis024sol/part_of_expr_that_tracks.h
Normal file
@ -0,0 +1,31 @@
|
||||
#ifndef PART_OF_EXPR_THAT_TRACKS_H
|
||||
#define PART_OF_EXPR_THAT_TRACKS_H
|
||||
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <libregexis024fa/tracking_variables.h>
|
||||
|
||||
struct SubtrackingNameInfo{
|
||||
bool stored_in_ca = true;
|
||||
bool stored_in_sa = false;
|
||||
|
||||
bool discovered = false;
|
||||
tracking_var_type type;
|
||||
/* These fields will be -1 if unused */
|
||||
int colarr_first = -1;
|
||||
int colarr_second = -1;
|
||||
|
||||
bool used_in_sifting = false;
|
||||
bool minimizing = false;
|
||||
int selarr_first = -1;
|
||||
int selarr_second = -1;
|
||||
};
|
||||
|
||||
struct KnownTrackingTools {
|
||||
std::map<std::string, int64_t> track_names;
|
||||
std::vector<SubtrackingNameInfo> retrieval_info;
|
||||
};
|
||||
|
||||
|
||||
#endif //PART_OF_EXPR_THAT_TRACKS_H
|
55
src/libregexis024sol/sol_misc_base.cpp
Normal file
55
src/libregexis024sol/sol_misc_base.cpp
Normal file
@ -0,0 +1,55 @@
|
||||
#include <libregexis024sol/sol_misc_base.h>
|
||||
#include <libregexis024vm/utils.h>
|
||||
|
||||
/* Records an error in the context. Only the first reported error is kept:
 * once ctx.error is set, later reports are ignored. */
void report(REGEX_IS024_MeaningContext &ctx, const char *error) {
    if (ctx.error)
        return;
    ctx.error = true;
    ctx.error_msg = error;
}
|
||||
|
||||
/* True when the read position is exactly at the end of the input */
bool isEnd(REGEX_IS024_MeaningContext &ctx) {
    const bool at_end = (ctx.input_size == ctx.pos);
    return at_end;
}
|
||||
|
||||
/* Decodes the code point at ctx.pos without consuming it.
 * Returns -1 at end of input; this is the only negative result that is not an error.
 * A negative value from the decoder is reported as "encoding error" and returned as is. */
int32_t peep(REGEX_IS024_MeaningContext &ctx) {
    if (isEnd(ctx))
        return -1;
    int32_t code_point;
    size_t byte_length;
    utf8_string_iterat(code_point, byte_length, ctx.pos, ctx.input, ctx.input_size);
    if (code_point < 0)
        report(ctx, "encoding error");
    return code_point;
}
|
||||
|
||||
/* Decodes the code point at ctx.pos and advances ctx.pos past it.
 * If the decoder yields a negative value, the position is left unchanged, an error is
 * reported and the negative value is returned. */
int32_t readChar(REGEX_IS024_MeaningContext &ctx) {
    int32_t code_point;
    size_t byte_length;
    utf8_string_iterat(code_point, byte_length, ctx.pos, ctx.input, ctx.input_size);
    if (code_point < 0) {
        report(ctx, "bruh what?? How this even happened");
        return code_point;
    }
    ctx.pos += byte_length;
    return code_point;
}
|
||||
|
||||
/* A name may consist only of ASCII digits and ASCII letters: [0-9a-zA-Z] */
bool is_REGEX024_nameConstituent(int32_t ch) {
    const bool is_digit = ch >= '0' && ch <= '9';
    const bool is_lower = ch >= 'a' && ch <= 'z';
    const bool is_upper = ch >= 'A' && ch <= 'Z';
    return is_digit || is_lower || is_upper;
}
|
||||
|
||||
/* Consumes the longest run of name constituents at the current position and returns it.
 * The result is empty when the first peeped character is not a name constituent. */
std::string tryRead_REGEX024_name(REGEX_IS024_MeaningContext &ctx) {
    std::string name;
    for (int32_t ch = peep(ctx); is_REGEX024_nameConstituent(ch); ch = peep(ctx)) {
        /* Name constituents are ASCII, so the narrowing is lossless */
        name += (char)ch;
        readChar(ctx);
    }
    return name;
}
|
||||
|
||||
|
20
src/libregexis024sol/sol_misc_base.h
Normal file
20
src/libregexis024sol/sol_misc_base.h
Normal file
@ -0,0 +1,20 @@
|
||||
/* DO NOT INCLUDE THIS FILE OUTSIDE libregexis024sol implementation */
|
||||
#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SOL_MISC_BASE_H
|
||||
#define LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SOL_MISC_BASE_H
|
||||
|
||||
#include <libregexis024sol/expr_compiler.h>
|
||||
#include <string>
|
||||
|
||||
void report(REGEX_IS024_MeaningContext& ctx, const char* error);
|
||||
|
||||
bool isEnd(REGEX_IS024_MeaningContext& ctx);
|
||||
int32_t peep(REGEX_IS024_MeaningContext& ctx);
|
||||
int32_t readChar(REGEX_IS024_MeaningContext& ctx);
|
||||
|
||||
|
||||
bool is_REGEX024_nameConstituent(int32_t ch);
|
||||
/* Name in my library consists of [0-9a-zA-Z]. If the first peeped letter is not name constituent,
|
||||
* empty string is returned */
|
||||
std::string tryRead_REGEX024_name(REGEX_IS024_MeaningContext& ctx);
|
||||
|
||||
#endif //LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SOL_MISC_BASE_H
|
36
src/libregexis024sol/special_terminals.h
Normal file
36
src/libregexis024sol/special_terminals.h
Normal file
@ -0,0 +1,36 @@
|
||||
/* DO NOT INCLUDE THIS FILE OUTSIDE libregexis024sol implementation */
|
||||
#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SPECIAL_TERMINALS_H
|
||||
#define LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SPECIAL_TERMINALS_H
|
||||
|
||||
#include <libregexis024sol/expr_compiler.h>
|
||||
#include <libregexis024sol/common_codesets.h>
|
||||
|
||||
/* This option of backslash usage should be checked last.
|
||||
* Function can generate error. Always check the error first */
|
||||
void
|
||||
backslash_expression_parsing_try_regular(REGEX_IS024_MeaningContext& ctx, const CommonCodesets& cc,
|
||||
bool& ret_is_multicode, codeset_t& ret_set);
|
||||
|
||||
struct CommandEntity;
|
||||
struct Command;
|
||||
struct CommandArgument;
|
||||
|
||||
struct CommandEntity{
|
||||
std::string name;
|
||||
std::vector<CommandArgument> arguments;
|
||||
};
|
||||
|
||||
/* One argument of a command. An argument is considered empty until marked otherwise. */
struct CommandArgument: CommandEntity{
    bool is_empty = true;
};
|
||||
|
||||
/* A parsed command expression.
 * NOTE(review): `tilda` presumably records a tilde modifier on the command; confirm
 * against command_expr_parse. */
struct Command: CommandEntity{
    bool tilda = false;
};
|
||||
|
||||
/* Beware: this function reports errors into the context */
|
||||
Command command_expr_parse(REGEX_IS024_MeaningContext& ctx);
|
||||
bool is_command_for_charset(const Command& cmd);
|
||||
void interpret_command_as_charset_giving(const CommonCodesets& cc, const Command& cmd, codeset_t& ret);
|
||||
|
||||
#endif //LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SPECIAL_TERMINALS_H
|
189
src/libregexis024sol/square_bracket_expression.cpp
Normal file
189
src/libregexis024sol/square_bracket_expression.cpp
Normal file
@ -0,0 +1,189 @@
|
||||
#include <libregexis024sol/square_bracket_expression.h>
|
||||
#include <libregexis024sol/sol_misc_base.h>
|
||||
#include <libregexis024sol/special_terminals.h>
|
||||
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
#include <assert.h>
|
||||
|
||||
/* Tells whether a peeped code point can start a single character or a character range.
 * A backslash is accepted here; the caller must later check that the backslash
 * expression is neither multi-character nor empty. */
bool soundsLikeCharOrRangeStart(int32_t peeped) {
    if (peeped < 0)
        return false;
    switch (peeped) {
        case '[':
        case ']':
        case '!':
        case '^':
        case '&':
        case '-':
            return false;
        default:
            return true;
    }
}
|
||||
|
||||
typedef REGEX_IS024_MeaningContext ctx_t;
|
||||
|
||||
struct ParseCall;
|
||||
typedef std::shared_ptr<ParseCall> chekushka;
|
||||
|
||||
struct ParseCall{
|
||||
codeset_t& result;
|
||||
|
||||
explicit ParseCall(codeset_t &result) : result(result) {}
|
||||
virtual ~ParseCall() = default;
|
||||
virtual chekushka afterReceive(ctx_t& ctx, const CommonCodesets& cc) { assert(false); }
|
||||
virtual chekushka firstTime(ctx_t& ctx, const CommonCodesets& cc) { assert(false); }
|
||||
};
|
||||
|
||||
#define call_ERROR_CHECK do { if (ctx.error) { return NULL; } } while (0)
|
||||
#define call_THROW(str) do { report(ctx, "square bracket expression: " str); return NULL; } while (0)
|
||||
|
||||
/* [...] */
|
||||
struct ZeroLvl_ParseCall: public ParseCall{
|
||||
explicit ZeroLvl_ParseCall(codeset_t &result) : ParseCall(result) {}
|
||||
chekushka afterReceive(ctx_t &ctx, const CommonCodesets& cc) override;
|
||||
chekushka firstTime(ctx_t &ctx, const CommonCodesets& cc) override;
|
||||
};
|
||||
|
||||
/* ...&...&... */
|
||||
struct FirstLvl_ParseCall: public ParseCall{
|
||||
codeset_t ret_buf_for_new;
|
||||
bool got_one = false;
|
||||
explicit FirstLvl_ParseCall(codeset_t& result) : ParseCall(result) {}
|
||||
chekushka afterReceive(ctx_t &ctx, const CommonCodesets& cc) override;
|
||||
chekushka firstTime(ctx_t &ctx, const CommonCodesets& cc) override;
|
||||
};
|
||||
|
||||
/* ab[]vgd[]eyo[]zhz */
|
||||
struct SecondLvl_ParseCall: public ParseCall{
|
||||
codeset_t ret_buf_for_new;
|
||||
explicit SecondLvl_ParseCall(codeset_t& result) : ParseCall(result) {}
|
||||
chekushka afterReceive(ctx_t &ctx, const CommonCodesets& cc) override;
|
||||
chekushka firstTime(ctx_t &ctx, const CommonCodesets& cc) override;
|
||||
};
|
||||
|
||||
/* ^... */
|
||||
struct CircumflexLvl_ParseCall: public ParseCall{
|
||||
codeset_t ret_buf_for_new;
|
||||
explicit CircumflexLvl_ParseCall(codeset_t& result) : ParseCall(result) {}
|
||||
chekushka afterReceive(ctx_t &ctx, const CommonCodesets& cc) override;
|
||||
chekushka firstTime(ctx_t &ctx, const CommonCodesets& cc) override;
|
||||
};
|
||||
|
||||
/* ********* ZeroLvl_ParseCall ********** */
|
||||
|
||||
/* Consumes the opening `[` and hands the content over to a FirstLvl frame that writes
 * into the same `result`. */
chekushka ZeroLvl_ParseCall::firstTime(ctx_t &ctx, const CommonCodesets& cc) {
    /* The read must happen outside of assert(): with NDEBUG the assert expression is not
     * evaluated and the bracket would never be consumed */
    const int32_t opening = readChar(ctx);
    assert(opening == U'[');
    (void)opening;
    return std::make_shared<FirstLvl_ParseCall>(result);
}
|
||||
|
||||
/* Runs after the content has been parsed: expects and consumes the closing `]`,
 * otherwise reports an error. Always ends this frame. */
chekushka ZeroLvl_ParseCall::afterReceive(ctx_t &ctx, const CommonCodesets& cc) {
    const bool closed = (peep(ctx) == U']');
    if (!closed)
        call_THROW("lvl 0: missing ]");
    readChar(ctx);
    return NULL;
}
|
||||
|
||||
/* ********* FirstLvl_ParseCall ********** */
|
||||
|
||||
/* The first `&` operand is parsed straight into `result` */
chekushka FirstLvl_ParseCall::firstTime(ctx_t &ctx, const CommonCodesets& cc) {
    chekushka first_operand = std::make_shared<SecondLvl_ParseCall>(result);
    return first_operand;
}
|
||||
|
||||
/* Runs after an operand has been parsed. The first operand is already in `result`;
 * every later one arrives in ret_buf_for_new and is intersected into `result`.
 * If a `&` follows, it is consumed and a frame for the next operand is returned;
 * otherwise this frame is done. */
chekushka FirstLvl_ParseCall::afterReceive(ctx_t &ctx, const CommonCodesets& cc) {
    if (got_one)
        result = intersect_sets(result, ret_buf_for_new);
    else
        got_one = true;
    if (peep(ctx) == U'&'){
        readChar(ctx);
        /* SecondLvl merges into the buffer it is given, so the previous operand must be
         * dropped first. Otherwise `[a&b&c]` would intersect with (b merged with c)
         * instead of c alone. */
        ret_buf_for_new = codeset_t();
        return std::make_shared<SecondLvl_ParseCall>(ret_buf_for_new);
    }
    return NULL;
}
|
||||
|
||||
/* ********* SecondLvl_ParseCall ********** */
|
||||
|
||||
/* Parses a run of items and merges each of them into `result`:
 *   `!cmd`     a charset command
 *   `\x`       a backslash expression (a single character or a whole set)
 *   `c`, `a-b` a single character or a character range
 * For `^` and `[` a child frame writing into ret_buf_for_new is returned; afterReceive()
 * merges its outcome and continues. Returns null when the next character cannot start
 * an item, or when an error was reported into ctx. */
chekushka SecondLvl_ParseCall::firstTime(ctx_t &ctx, const CommonCodesets& cc) {
    while (true) {
        int32_t ch = peep(ctx); call_ERROR_CHECK;
        if (ch == U'^')
            return std::make_shared<CircumflexLvl_ParseCall>(ret_buf_for_new);
        if (ch == U'[')
            return std::make_shared<ZeroLvl_ParseCall>(ret_buf_for_new);
        if (ch == U'!'){
            Command cmd = command_expr_parse(ctx); call_ERROR_CHECK;
            if (!is_command_for_charset(cmd))
                call_THROW("second lvl: illegal command");
            interpret_command_as_charset_giving(cc, cmd, ret_buf_for_new);
            result = merge_sets(result, ret_buf_for_new);
            continue;
        }
        if (!soundsLikeCharOrRangeStart(ch))
            return NULL;

        readChar(ctx);
        bool bs_multicode = false;
        codeset_t bs_stuff;
        if (ch == U'\\'){
            backslash_expression_parsing_try_regular(ctx, cc, bs_multicode, bs_stuff);
            /* The outputs are only meaningful if no error was reported */
            call_ERROR_CHECK;
            if (bs_multicode){
                result = merge_sets(result, bs_stuff);
                continue;
            }
            ret_buf_for_new = codeset_of_one_char(bs_stuff[0].first);
        } else {
            ret_buf_for_new = codeset_of_one_char(ch);
        }

        int32_t mCh = peep(ctx); call_ERROR_CHECK;
        if (mCh == U'-'){
            /* A range: the single character parsed above becomes its lower bound */
            readChar(ctx);
            int32_t scnd = peep(ctx); call_ERROR_CHECK;
            if (scnd == U'\\'){
                readChar(ctx);
                backslash_expression_parsing_try_regular(ctx, cc, bs_multicode, bs_stuff);
                call_ERROR_CHECK;
                if (bs_multicode)
                    call_THROW("second lvl: char range: bad escape expression after hyphen");
                ret_buf_for_new[0].second = bs_stuff[0].first;
            } else if (soundsLikeCharOrRangeStart(scnd)){
                /* Consumed only once it is known to be valid, so that a missing upper
                 * bound (e.g. end of input) is reported with the message below */
                readChar(ctx);
                ret_buf_for_new[0].second = (uint32_t)scnd;
            } else {
                call_THROW("second lvl: char range: bad value after hyphen");
            }
            if (ret_buf_for_new[0].second < ret_buf_for_new[0].first)
                call_THROW("second: lvl: char range: invalid range");
        }
        result = merge_sets(result, ret_buf_for_new);
    }
}
|
||||
|
||||
/* A child frame has filled ret_buf_for_new: merge it in and go on with the next item */
chekushka SecondLvl_ParseCall::afterReceive(ctx_t &ctx, const CommonCodesets& cc) {
    result = merge_sets(result, ret_buf_for_new);
    chekushka next_call = firstTime(ctx, cc);
    return next_call;
}
|
||||
|
||||
/* ********* CircumflexLvl_ParseCall ********* */
|
||||
|
||||
/* Consumes the `^` and returns a FirstLvl frame that parses the set to be inverted
 * into ret_buf_for_new. */
chekushka CircumflexLvl_ParseCall::firstTime(ctx_t &ctx, const CommonCodesets& cc) {
    /* The read must happen outside of assert(): with NDEBUG the assert expression is not
     * evaluated and the circumflex would never be consumed */
    const int32_t circumflex = readChar(ctx);
    assert(circumflex == U'^');
    (void)circumflex;
    return std::make_shared<FirstLvl_ParseCall>(ret_buf_for_new);
}
|
||||
|
||||
/* Stores the inversion of the parsed set in `result`; this frame is then done */
chekushka CircumflexLvl_ParseCall::afterReceive(ctx_t &ctx, const CommonCodesets& cc) {
    codeset_t inverted = invert_set(ret_buf_for_new);
    result = inverted;
    return NULL;
}
|
||||
|
||||
|
||||
/* Aaaaaaaaand... The function we have all been waiting for so long! */
|
||||
/* Parses a square bracket expression starting at ctx.pos and returns its codeset.
 * Recursion is replaced by an explicit stack of ParseCall frames: a frame that returns
 * a child is suspended until the child is popped, then resumed through afterReceive().
 * An empty codeset is returned if an error is found in ctx while frames are pending. */
codeset_t sq_bracket_expr_parse(REGEX_IS024_MeaningContext &ctx, const CommonCodesets& cc) {
    std::vector<std::shared_ptr<ParseCall>> callStack;
    codeset_t res;
    callStack.push_back(std::make_shared<ZeroLvl_ParseCall>(res));

    /* `entering` is true when the top frame has not run yet */
    for (bool entering = true; !callStack.empty(); ) {
        if (ctx.error)
            return {};
        ParseCall& top = *callStack.back();
        chekushka child = entering ? top.firstTime(ctx, cc) : top.afterReceive(ctx, cc);
        if (child) {
            callStack.push_back(child);
            entering = true;
        } else {
            callStack.pop_back();
            entering = false;
        }
    }
    return res;
}
|
10
src/libregexis024sol/square_bracket_expression.h
Normal file
10
src/libregexis024sol/square_bracket_expression.h
Normal file
@ -0,0 +1,10 @@
|
||||
/* DO NOT INCLUDE THIS FILE OUTSIDE libregexis024sol implementation */
|
||||
#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SQUARE_BRACKET_EXPRESSION_H
|
||||
#define LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SQUARE_BRACKET_EXPRESSION_H
|
||||
|
||||
#include <libregexis024sol/expr_compiler.h>
|
||||
#include <libregexis024sol/common_codesets.h>
|
||||
|
||||
codeset_t sq_bracket_expr_parse(REGEX_IS024_MeaningContext& ctx, const CommonCodesets& cc);
|
||||
|
||||
#endif //LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SQUARE_BRACKET_EXPRESSION_H
|
184
src/libregexis024sol/subexpr_fa_transformed.cpp
Normal file
184
src/libregexis024sol/subexpr_fa_transformed.cpp
Normal file
@ -0,0 +1,184 @@
|
||||
#include <libregexis024sol/subexpr_fa_transformed.h>
|
||||
#include <libregexis024fa/misc_fa_funcs.h>
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
|
||||
/* Builds a subexpression made of a single node that reads one character of `codeset` */
SubExprCompiled subexpr_charset_reading_filter(const codeset_t &codeset, FA_Container &fa) {
    auto* reader = fa.makeOneCharRead(codeset, false);
    return subexpression_from_path(reader);
}
|
||||
|
||||
/* Concatenates two subexpressions: every open end of A is attached to the start of B.
 * An operand without a start node acts as the neutral element. */
SubExprCompiled join(const SubExprCompiled &A, const SubExprCompiled &B) {
    if (!A.start)
        return B;
    if (!B.start)
        return A;

    for (FA_Node** open_end : A.ends)
        reattach_fa_node_edge(open_end, B.start);

    SubExprCompiled joined;
    joined.start = A.start;
    joined.ends = B.ends;
    /* The concatenation matches the empty string only if both halves do */
    joined.can_be_empty = A.can_be_empty && B.can_be_empty;
    return joined;
}
|
||||
|
||||
/* Wraps a single path node into a subexpression: the node is the start and its only
 * outgoing edge is the only open end. */
SubExprCompiled subexpression_from_path(FA_NodePathPart *node) {
    SubExprCompiled wrapped;
    wrapped.start = node;
    wrapped.ends.push_back(&(node->nxt_node));
    /* one_char_read is the only path node type that reads a character */
    const bool reads_char = (node->type == one_char_read);
    wrapped.can_be_empty = !reads_char;
    return wrapped;
}
|
||||
|
||||
/* Copies the section of the automaton reachable from source.start.
 * Pass 1 walks the original nodes breadth-first, clones each one and numbers the
 * originals through search_mark. Pass 2 redirects every non-null edge of every clone
 * from the original target to that target's clone. Null edges of the clones become the
 * open ends of the copy. All search marks are reset to -1 before returning. */
SubExprCompiled RobertAngier(const SubExprCompiled& source, FA_Container& fa) {
    SubExprCompiled copy;
    if (!source.start)
        return copy;

    /* originals[i]->search_mark == i, and clones[i] is the clone of originals[i] */
    std::vector<FA_Node*> originals;
    std::vector<FA_Node*> clones;
    originals.push_back(source.start);
    source.start->search_mark = 0;

    for (size_t i = 0; i < originals.size(); i++){
        FA_Node* twin = copy_fa_node(*originals[i], fa);
        clones.push_back(twin);
        /* The edges of a fresh clone still point at original nodes */
        for (FA_Node** edge : twin->get_all_transitions()){
            FA_Node* target = *edge;
            if (!target){
                copy.ends.push_back(edge);
            } else if (target->search_mark < 0){
                target->search_mark = (int64_t)originals.size();
                originals.push_back(target);
            }
        }
    }

    copy.start = clones[0];
    for (FA_Node* twin : clones){
        for (FA_Node** edge : twin->get_all_transitions()){
            if (!(*edge))
                continue;
            assert((**edge).search_mark >= 0);
            reattach_fa_node_edge(edge, clones[(**edge).search_mark]);
        }
    }

    for (FA_Node* visited : originals)
        visited->search_mark = -1;
    return copy;
}
|
||||
|
||||
/* Attaches every open end of `patient` to `node`.
 * Both `node` and patient.start must be non-null, and every end must still be open (null). */
void reattach_all_ends_to_one_node(SubExprCompiled& patient, FA_Node* node){
    assert(node);
    assert(patient.start);
    /* The unconditional debug print to stdout was removed: library code must not write
     * to the standard output of the program using it */
    for (FA_Node** end: patient.ends){
        assert(!(*end));
        reattach_fa_node_edge(end, node);
    }
}
|
||||
|
||||
/* Rewrites `patient` in place so that it matches itself between min_allowed and
 * max_allowed times. A max_allowed above REGEXIS024_MAX_REPEAT means "no upper bound".
 * An empty patient (no start node) is left untouched. */
void apply_repeat_to_subexpression(SubExprCompiled &patient, FA_Container& fa, size_t min_allowed, size_t max_allowed) {
    assert(min_allowed <= max_allowed && min_allowed <= REGEXIS024_MAX_REPEAT);
    if (!patient.start)
        return;
    const bool unbounded = max_allowed > REGEXIS024_MAX_REPEAT;

    if (min_allowed == 0 && max_allowed == 0){
        /* Zero repetitions: nothing is left of the subexpression */
        patient = {};
    } else if (min_allowed == 1 && max_allowed == 1){
        /* Exactly once: already the case */
    } else if (min_allowed == 0 && unbounded){
        /* Star: a fork that either enters the body (which loops back) or leaves */
        FA_NodeOfForking* loop = fa.makeForking();
        add_option_to_fork_node(loop, patient.start);
        for (FA_Node** body_end : patient.ends)
            reattach_fa_node_edge(body_end, loop);
        add_option_to_fork_node(loop, NULL);
        patient.start = loop;
        patient.ends = {&(loop->nxt_options[1])};
    } else if (min_allowed == 1 && unbounded) {
        /* Plus: the body runs first, then a fork either re-enters it or leaves */
        FA_NodeOfForking* loop = fa.makeForking();
        reattach_all_ends_to_one_node(patient, loop);
        add_option_to_fork_node(loop, patient.start);
        add_option_to_fork_node(loop, NULL);
        patient.ends = {&(loop->nxt_options[1])};
    } else if (min_allowed == 0 && max_allowed == 1){
        /* Optional: a fork that either enters the body or skips it */
        FA_NodeOfForking* skip = fa.makeForking();
        add_option_to_fork_node(skip, patient.start);
        add_option_to_fork_node(skip, NULL);
        patient.start = skip;
        patient.ends.push_back(&(skip->nxt_options[1]));
    } else if (unbounded) {
        /* At least min_allowed times: a chain of min_allowed copies, the last one looping */
        std::vector<SubExprCompiled> chain(min_allowed);
        chain[0] = patient;
        for (size_t i = 1; i < min_allowed; i++)
            chain[i] = RobertAngier(patient, fa);
        FA_NodeOfForking* loop = fa.makeForking();
        for (size_t i = 0; i + 1 < min_allowed; i++)
            reattach_all_ends_to_one_node(chain[i], chain[i + 1].start);
        reattach_all_ends_to_one_node(chain[min_allowed - 1], loop);
        add_option_to_fork_node(loop, chain[min_allowed - 1].start);
        add_option_to_fork_node(loop, NULL);
        /* patient.start stays as it is: the original is chain[0] */
        patient.ends = {&(loop->nxt_options[1])};
    } else {
        /* Bounded: a chain of max_allowed copies; entering it at copy i yields
         * max_allowed - i repetitions */
        std::vector<SubExprCompiled> copies(max_allowed);
        copies[max_allowed - 1] = patient;
        for (size_t i = 0; i + 1 < max_allowed; i++)
            copies[i] = RobertAngier(patient, fa);
        for (size_t i = 0; i + 1 < max_allowed; i++)
            reattach_all_ends_to_one_node(copies[i], copies[i + 1].start);
        FA_NodeOfForking* entry = fa.makeForking();
        const size_t entry_points = (min_allowed > 0) ? (max_allowed - min_allowed + 1) : max_allowed;
        for (size_t i = 0; i < entry_points; i++)
            add_option_to_fork_node(entry, copies[i].start);
        if (min_allowed == 0){
            /* One more way out: skip every copy */
            add_option_to_fork_node(entry, NULL);
            patient.ends.push_back(&(entry->nxt_options[max_allowed]));
        }
        patient.start = entry;
        /* The other ends stay as they are: the original is copies.back() */
    }
    if (min_allowed == 0)
        patient.can_be_empty = true;
}
|
||||
|
||||
/* Combines alternatives into one subexpression.
 *   no option with a start node  -> default (empty) subexpression
 *   exactly one such option      -> that option, unchanged
 *   several                      -> a forking node with one branch per such option;
 *                                   the open ends of all of them become the ends
 * NOTE(review): options without a start node get no branch of their own, so an empty
 * alternative such as the one in `(a|)` looks like it is dropped from the automaton.
 * Confirm that the caller handles it. */
SubExprCompiled forkify(const std::vector<SubExprCompiled> &options, FA_Container& fa){
    SubExprCompiled result;
    result.can_be_empty = false;
    size_t with_start = 0;
    for (const SubExprCompiled& candidate: options){
        result.can_be_empty |= candidate.can_be_empty;
        if (candidate.start)
            with_start++;
    }

    if (with_start == 0){
        result.can_be_empty = true;
        return result;
    }

    if (with_start == 1){
        for (const SubExprCompiled& candidate: options){
            if (!candidate.start)
                continue;
            result = candidate;
            break;
        }
        return result;
    }

    FA_NodeOfForking* fork = fa.makeForking();
    result.start = fork;
    fork->nxt_options.reserve(with_start);
    for (const SubExprCompiled& candidate: options){
        if (!candidate.start)
            continue;
        add_option_to_fork_node(fork, candidate.start);
        result.ends.insert(result.ends.end(), candidate.ends.begin(), candidate.ends.end());
    }
    return result;
}
|
||||
|
||||
/* Debug check that this object is still in its default-constructed state */
void SubExprCompiled::assertDefault() {
    assert(!start);
    assert(ends.empty());
    assert(can_be_empty);
}
|
32
src/libregexis024sol/subexpr_fa_transformed.h
Normal file
32
src/libregexis024sol/subexpr_fa_transformed.h
Normal file
@ -0,0 +1,32 @@
|
||||
#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SUBEXPR_FA_TRANSFORMED_H
|
||||
#define LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SUBEXPR_FA_TRANSFORMED_H
|
||||
|
||||
#include <libregexis024fa/finite_automaton.h>
|
||||
|
||||
struct SubExprCompiled{
|
||||
FA_Node* start = NULL;
|
||||
/* After putting there values from neighbour vectors in nodes, these vectors must not change size */
|
||||
std::vector<FA_Node**> ends;
|
||||
bool can_be_empty = true;
|
||||
|
||||
void assertDefault();
|
||||
};
|
||||
|
||||
SubExprCompiled subexpr_charset_reading_filter(const codeset_t& codeset, FA_Container& fa);
|
||||
|
||||
SubExprCompiled join(const SubExprCompiled& A, const SubExprCompiled& B);
|
||||
|
||||
SubExprCompiled forkify(const std::vector<SubExprCompiled>& options, FA_Container& fa);
|
||||
|
||||
SubExprCompiled subexpression_from_path(FA_NodePathPart* node);
|
||||
|
||||
/* And then Robert Angier said `It's prestige time` and prestiged all over the place.
|
||||
* If you still don't get it, this function copies section of NFA of regexp */
|
||||
SubExprCompiled RobertAngier(const SubExprCompiled& source, FA_Container& fa);
|
||||
|
||||
#define REGEXIS024_MAX_REPEAT 64
|
||||
|
||||
/* pass REGEXIS024_MAX_REPEAT + 1 as max_allowed to allow infinite repeat */
|
||||
void apply_repeat_to_subexpression(SubExprCompiled& patient, FA_Container& fa, size_t min_allowed, size_t max_allowed);
|
||||
|
||||
#endif //LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SUBEXPR_FA_TRANSFORMED_H
|
141
src/libregexis024test/byte_code_assembler.h
Normal file
141
src/libregexis024test/byte_code_assembler.h
Normal file
@ -0,0 +1,141 @@
|
||||
/* This file is used for testing purposes only. Do not copy this file to installation prefix.
|
||||
 * This technique exploits C compiler capabilities to get a regex024 assembler for free */
|
||||
#ifndef LIBREGEXIS024_BYTE_CODE_ASSEMBLER_H
|
||||
#define LIBREGEXIS024_BYTE_CODE_ASSEMBLER_H
|
||||
|
||||
#include "vibe_check.h"
|
||||
|
||||
#include <libregexis024vm/vm_opcodes.h>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <stdio.h>
|
||||
|
||||
struct assembler_context_bookmark{
|
||||
regex_near_ptr_t pos_in_r024program;
|
||||
int LINE;
|
||||
};
|
||||
|
||||
struct pending_bookmark{
|
||||
/* Must fill this byte with pos of pos_in_r024program in assembler_context_bookmark
|
||||
* In a sense, this is a pointer to a NULL pointer that is yet to become normal kinda pointer */
|
||||
regex_near_ptr_t pos_in_r024program;
|
||||
const char* name;
|
||||
/* LINE of the reference is needed in case of error */
|
||||
int LINE;
|
||||
};
|
||||
|
||||
struct assembler_context{
|
||||
std::map<std::string, assembler_context_bookmark> bookmarks;
|
||||
std::vector<pending_bookmark> unresolved_references;
|
||||
std::vector<uint8_t> result;
|
||||
|
||||
void declare_bookmark(const char* name, int LINE_of_this){
|
||||
if (bookmarks.count(name)){
|
||||
fprintf(stderr, "Double bookmark '%s' definition in lines %d and %d\n", name, bookmarks[name].LINE, LINE_of_this);
|
||||
exit(1);
|
||||
}
|
||||
bookmarks[name] = {result.size(), LINE_of_this};
|
||||
}
|
||||
|
||||
void resolve_references(){
|
||||
for (pending_bookmark& br: unresolved_references){
|
||||
if (bookmarks.count(br.name) == 0){
|
||||
fprintf(stderr, "Unknown bookmark '%s' is requested on line %d\n", br.name, br.LINE);
|
||||
exit(1);
|
||||
}
|
||||
/* pending bookmerk requests should be added only with beg_for_bookmark method,
|
||||
* or else SEGFAULT will be your frequent guest */
|
||||
*reinterpret_cast<regex_near_ptr_t *>(&result[br.pos_in_r024program]) = bookmarks[br.name].pos_in_r024program;
|
||||
}
|
||||
}
|
||||
|
||||
void put_byte(uint8_t x){
|
||||
result.push_back(x);
|
||||
}
|
||||
|
||||
void put_word(uint16_t x){
|
||||
put_byte(x & UINT8_MAX);
|
||||
put_byte(x >> 8);
|
||||
}
|
||||
|
||||
void put_doubleword(uint32_t x){
|
||||
put_word(x & UINT16_MAX);
|
||||
put_word(x >> 16);
|
||||
}
|
||||
|
||||
void put_quadword(uint64_t x){
|
||||
put_doubleword(x & UINT32_MAX);
|
||||
put_doubleword(x >> 32);
|
||||
}
|
||||
|
||||
void beg_for_bookmark(const char* name, int LINE_of_this){
|
||||
unresolved_references.push_back({result.size(), name, LINE_of_this});
|
||||
put_quadword(0);
|
||||
}
|
||||
};
|
||||
|
||||
/* Building blocks of the instruction macros. Every macro in this file expects a variable
 * `daCtx` (opened by s_BEGIN_ASSEMBLER_CONTEXT) to be in scope and expands to complete
 * statements, so instructions are written without a trailing semicolon. */
#define msh_put_instr(ename) daCtx.put_byte(regex024_opcodes::ename);
#define msh_put_sslot(ssid) daCtx.put_doubleword(ssid);
#define msh_put_track_arr_ind(i) daCtx.put_word(i);
#define msh_put_x(meth, x) daCtx.meth(x);
/* Here my assembler begs for bookmark to jump on */
#define msh_bookmark_reference(name) daCtx.beg_for_bookmark(name, __LINE__);

/* Opens a scope with a fresh assembler; END resolves the labels and moves the bytes into pass_vec */
#define s_BEGIN_ASSEMBLER_CONTEXT() { assembler_context daCtx{};
#define s_END_ASSEMBLER_CONTEXT(pass_vec) daCtx.resolve_references(); (pass_vec) = std::move(daCtx.result); }

/* Here user declares a bookmark */
#define c_BOOKMARK(name) daCtx.declare_bookmark((name), __LINE__);

#define i_READ(ssid) msh_put_instr(READ) msh_put_sslot(ssid)
#define i_READZ() msh_put_instr(READZ)
#define i_JUMP(bookmark) msh_put_instr(JUMP) msh_bookmark_reference(bookmark)

/* Conditional jump: opcode JC<condition>_<size>, then the immediate, then the destination */
#define msh_conditional_jump(condition, size_postfix, meth, x, bookmark) \
    msh_put_instr(JC ## condition ## _ ## size_postfix) \
    msh_put_x(meth, x) msh_bookmark_reference(bookmark)

#define i_JCEQUAL_B(x, bookmark) msh_conditional_jump(EQUAL, B, put_byte, x, bookmark)
#define i_JCEQUAL_W(x, bookmark) msh_conditional_jump(EQUAL, W, put_word, x, bookmark)
#define i_JCEQUAL_DW(x, bookmark) msh_conditional_jump(EQUAL, DW, put_doubleword, x, bookmark)
#define i_JCEQUAL_QW(x, bookmark) msh_conditional_jump(EQUAL, QW, put_quadword, x, bookmark)

#define i_JCLESS_B(x, bookmark) msh_conditional_jump(LESS, B, put_byte, x, bookmark)
#define i_JCLESS_W(x, bookmark) msh_conditional_jump(LESS, W, put_word, x, bookmark)
#define i_JCLESS_DW(x, bookmark) msh_conditional_jump(LESS, DW, put_doubleword, x, bookmark)
#define i_JCLESS_QW(x, bookmark) msh_conditional_jump(LESS, QW, put_quadword, x, bookmark)

#define i_JCGRTR_B(x, bookmark) msh_conditional_jump(GRTR, B, put_byte, x, bookmark)
#define i_JCGRTR_W(x, bookmark) msh_conditional_jump(GRTR, W, put_word, x, bookmark)
#define i_JCGRTR_DW(x, bookmark) msh_conditional_jump(GRTR, DW, put_doubleword, x, bookmark)
#define i_JCGRTR_QW(x, bookmark) msh_conditional_jump(GRTR, QW, put_quadword, x, bookmark)

#define i_FORK(ssid, bookmark) msh_put_instr(FORK) msh_put_sslot(ssid) msh_bookmark_reference(bookmark)
#define i_MATCH() msh_put_instr(MATCH)
#define i_DIE() msh_put_instr(DIE)
#define i_PARAM_READ_SS_NUMBER(ssid) msh_put_instr(PARAM_READ_SS_NUMBER) msh_put_sslot(ssid)
#define i_PARAM_FORK_SS_NUMBER(ssid) msh_put_instr(PARAM_FORK_SS_NUMBER) msh_put_sslot(ssid)
#define i_PARAM_SELARR_LEN(tai) msh_put_instr(PARAM_SELARR_LEN) msh_put_track_arr_ind(tai)
#define i_PARAM_COLSIFTFUNC_SET(bookmark) msh_put_instr(PARAM_COLSIFTFUNC_SET) msh_bookmark_reference(bookmark)
#define i_PARAM_COLSIFTFUNC_WIPE() msh_put_instr(PARAM_COLSIFTFUNC_WIPE)
#define i_MSG_MULTISTART_ALLOWED(is_allowed) msh_put_instr(MSG_MULTISTART_ALLOWED) daCtx.put_byte(is_allowed);
#define i_MSG_FED_INPUT_EXTENDED(left, right, part) msh_put_instr(MSG_FED_INPUT_EXTENDED) \
    daCtx.put_byte(left); daCtx.put_byte(right); msh_put_sslot(part)
#define i_DMOV_RABX_SELARR(tai) msh_put_instr(DMOV_RABX_SELARR) msh_put_track_arr_ind(tai)
#define i_DDIST_RABX_SELARR(tai_beg, tai_end) msh_put_instr(DDIST_RABX_SELARR) \
    msh_put_track_arr_ind(tai_beg) msh_put_track_arr_ind(tai_end)
/* Old spelling without the i_ prefix, kept so that existing users keep compiling */
#define DDIST_RABX_SELARR(tai_beg, tai_end) i_DDIST_RABX_SELARR(tai_beg, tai_end)
#define i_SIFTPRIOR_MIN_RABX() msh_put_instr(SIFTPRIOR_MIN_RABX)
#define i_SIFTPRIOR_MAX_RABX() msh_put_instr(SIFTPRIOR_MAX_RABX)
#define i_SIFT_DONE() msh_put_instr(SIFT_DONE)
#define i_MOV_COLARR_IMM(tai, qw_x) msh_put_instr(MOV_COLARR_IMM) msh_put_track_arr_ind(tai) \
    daCtx.put_quadword(qw_x);
#define i_MOV_COLARR_BTPOS(tai) msh_put_instr(MOV_COLARR_BTPOS) msh_put_track_arr_ind(tai)
#define i_MOV_SELARR_IMM(tai, qw_x) msh_put_instr(MOV_SELARR_IMM) msh_put_track_arr_ind(tai) \
    daCtx.put_quadword(qw_x);
/* The track array index operand must be emitted: the disassembler reads one after this opcode */
#define i_MOV_SELARR_CHPOS(tai) msh_put_instr(MOV_SELARR_CHPOS) msh_put_track_arr_ind(tai)
#define i_INIT() msh_put_instr(INIT)
#define i_THROW() msh_put_instr(THROW)
#define i_DEBUG() msh_put_instr(DEBUG)
|
||||
|
||||
#endif //LIBREGEXIS024_BYTE_CODE_ASSEMBLER_H
|
205
src/libregexis024test/byte_code_disassembler.h
Normal file
205
src/libregexis024test/byte_code_disassembler.h
Normal file
@ -0,0 +1,205 @@
|
||||
/* This file is used for testing purposes only. Do not copy this file to installation prefix. */
|
||||
#ifndef LIBREGEXIS024_BYTE_CODE_DISASSEMBLER_H
|
||||
#define LIBREGEXIS024_BYTE_CODE_DISASSEMBLER_H
|
||||
|
||||
#include "vibe_check.h"
|
||||
|
||||
#include <libregexis024vm/vm_opcodes.h>
|
||||
#include <libregexis024vm/utils.h>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <stdio.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
// TODO: apply here my new change in near pointer size
|
||||
|
||||
/* What the disassembler knows about one jump destination */
struct landing_place_resolvance{
    /* Index of the label name assigned to this destination.
     * Initialized so that a default-constructed object never holds an indeterminate value. */
    size_t name_id = 0;
    /* Set once the label has been printed in front of an instruction */
    bool visited = false;
    landing_place_resolvance() = default;
    landing_place_resolvance(size_t nameId, bool visited) : name_id(nameId), visited(visited) {}
};
|
||||
|
||||
void print_disassembly(size_t prgSize, uint8_t* prg){
|
||||
std::vector<std::string> names = {
|
||||
"Александр", "Мария", "Иван", "Анна", "Дмитрий", "Екатерина", "Алексей",
|
||||
"Ольга", "Михаил", "София", "Сергей", "Анастасия", "Артем", "Виктория",
|
||||
"Андрей", "Елена", "Максим", "Алиса", "Павел", "Наталья", "Денис", "Юлия",
|
||||
"Владимир", "Маргарита", "Никита", "Дарья", "Илья", "Алина", "Роман", "Евгения",
|
||||
"Кирилл", "Елизавета", "Антон", "Татьяна", "Владислав", "Валерия", "Георгий",
|
||||
"Ксения", "Арсений", "Милана", "Даниил", "Вероника", "Тимофей", "Арина",
|
||||
"Николай", "Кристина", "Степан", "Алёна", "Игорь", "Алла", "Григорий", "Ева",
|
||||
"Олег", "Яна", "Семен", "Марина", "Федор", "Светлана", "Василий", "Людмила"
|
||||
};
|
||||
uint64_t used_names = 0;
|
||||
/* From program position -> to names[ind] & */
|
||||
std::map<regex_near_ptr_t, landing_place_resolvance> bookmarks;
|
||||
regex_near_ptr_t IP = 0;
|
||||
|
||||
auto check_inboundness = [&](int region){
|
||||
if (!vmprog_check_inboundness(prgSize, IP, region))
|
||||
exitf("This program can't be decomposed into commands in a trivial way");
|
||||
};
|
||||
auto extract_b = [&]() -> uint8_t{
|
||||
check_inboundness(1);
|
||||
return vmprog_extract_b(&IP, prg);
|
||||
};
|
||||
auto extract_w = [&]() -> uint16_t {
|
||||
check_inboundness(2);
|
||||
return vmprog_extract_w(&IP, prg);
|
||||
};
|
||||
auto extract_dw = [&]() -> uint32_t {
|
||||
check_inboundness(4);
|
||||
return vmprog_extract_dw(&IP, prg);
|
||||
};
|
||||
auto extract_qw = [&]() -> uint64_t {
|
||||
check_inboundness(8);
|
||||
return vmprog_extract_qw(&IP, prg);
|
||||
};
|
||||
auto extract_instruction = [&]() -> uint8_t{
|
||||
return extract_b();
|
||||
};
|
||||
auto extract_sslot_id = [&]() -> regex_sslot_id_t{
|
||||
return extract_dw();
|
||||
};
|
||||
auto extract_near_pointer = [&]() -> regex_near_ptr_t{
|
||||
return extract_qw();
|
||||
};
|
||||
auto extract_track_array_index = [&]() -> regex_tai_t{
|
||||
return extract_w();
|
||||
};
|
||||
|
||||
bool second_phase = false;
|
||||
|
||||
auto fph_register_landing = [&](regex_near_ptr_t pos){
|
||||
if (!second_phase){
|
||||
if (bookmarks.count(pos) == 0){
|
||||
if (used_names == names.size())
|
||||
names.push_back("Закладка_" + std::to_string(used_names));
|
||||
bookmarks.insert({pos, {used_names, false}});
|
||||
used_names++;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
auto get_bookmark_in_2phase = [&](regex_near_ptr_t pos) -> std::string {
|
||||
if (bookmarks.count(pos) == 0)
|
||||
exitf("bruh");
|
||||
return names[bookmarks[pos].name_id];
|
||||
};
|
||||
|
||||
auto one_reading = [&](){
|
||||
while (IP < prgSize) {
|
||||
regex_near_ptr_t start_pos = IP;
|
||||
if (second_phase){
|
||||
if (bookmarks.count(IP) != 0){
|
||||
printf("%s:\n", get_bookmark_in_2phase(IP).c_str());
|
||||
bookmarks[IP].visited = true;
|
||||
}
|
||||
}
|
||||
uint8_t opcode = extract_instruction();
|
||||
switch (opcode) {
|
||||
#define secPrint(fmt, ...) if (second_phase) {printf("% 3lu) " fmt, start_pos, __VA_ARGS__);} } break;
|
||||
#define secPrintNoArg(str) if (second_phase) {printf("% 3lu) " str, start_pos);} } break;
|
||||
#define instCase(oper_code) case regex024_opcodes::oper_code: {
|
||||
#define jcMess(cond, sz_uppercase, x_t, extract_method, printf_sign) \
|
||||
instCase(JC ## cond ## _ ## sz_uppercase) \
|
||||
x_t x = extract_method(); \
|
||||
regex_near_ptr_t dest = extract_near_pointer(); \
|
||||
fph_register_landing(dest); \
|
||||
secPrint("JC" #cond "_" #sz_uppercase " %" printf_sign " $%s\n", x, get_bookmark_in_2phase(dest).c_str())
|
||||
#define jcCacaphony(cond) \
|
||||
jcMess(cond, B, uint8_t, extract_b, PRIu8) \
|
||||
jcMess(cond, W, uint16_t, extract_w, PRIu16) \
|
||||
jcMess(cond, DW, uint32_t, extract_dw, PRIu32) \
|
||||
jcMess(cond, QW, uint64_t, extract_qw, PRIu64)
|
||||
#define simpleDimple(name) instCase(name) secPrintNoArg(#name "\n")
|
||||
|
||||
instCase(READ)
|
||||
uint32_t ssid = extract_sslot_id();
|
||||
secPrint("READ %u\n", ssid)
|
||||
simpleDimple(READZ)
|
||||
instCase(JUMP)
|
||||
uint32_t dest = extract_near_pointer();
|
||||
fph_register_landing(dest);
|
||||
secPrint("JUMP $%s\n", get_bookmark_in_2phase(dest).c_str())
|
||||
|
||||
jcCacaphony(EQUAL)
|
||||
jcCacaphony(LESS)
|
||||
jcCacaphony(GRTR)
|
||||
|
||||
instCase(FORK)
|
||||
uint32_t ssid = extract_sslot_id();
|
||||
regex_near_ptr_t dest = extract_near_pointer();
|
||||
fph_register_landing(dest);
|
||||
secPrint("FORK %u $%s\n", ssid, get_bookmark_in_2phase(dest).c_str())
|
||||
simpleDimple(MATCH)
|
||||
simpleDimple(DIE)
|
||||
instCase(PARAM_READ_SS_NUMBER)
|
||||
regex_sslot_id_t ssid_max_plus_one = extract_sslot_id();
|
||||
secPrint("PARAM_READ_SS_NUMBER %u\n", ssid_max_plus_one)
|
||||
instCase(PARAM_FORK_SS_NUMBER)
|
||||
regex_sslot_id_t ssid_max_plus_one = extract_sslot_id();
|
||||
secPrint("PARAM_FORK_SS_NUMBER %u\n", ssid_max_plus_one)
|
||||
instCase(PARAM_SELARR_LEN)
|
||||
regex_tai_t tai_max_plus_one = extract_track_array_index();
|
||||
secPrint("PARAM_SELARR_LEN %hu\n", tai_max_plus_one)
|
||||
instCase(PARAM_COLSIFTFUNC_SET)
|
||||
regex_near_ptr_t entry = extract_near_pointer();
|
||||
fph_register_landing(entry);
|
||||
secPrint("PARAM_COLSIFTFUNC_SET $%s\n", get_bookmark_in_2phase(entry).c_str())
|
||||
simpleDimple(PARAM_COLSIFTFUNC_WIPE)
|
||||
instCase(MSG_MULTISTART_ALLOWED)
|
||||
uint8_t is_allowed = extract_b();
|
||||
secPrint("MSG_MULTISTART_ALLOWED %hhu\n", is_allowed)
|
||||
instCase(MSG_FED_INPUT_EXTENDED)
|
||||
uint8_t left = extract_b();
|
||||
uint8_t right = extract_b();
|
||||
regex_sslot_id_t part = extract_sslot_id();
|
||||
secPrint("MSG_FED_INPUT_EXTENDED %hhu %hhu %u\n", left, right, part)
|
||||
instCase(DMOV_RABX_SELARR)
|
||||
regex_tai_t i = extract_track_array_index();
|
||||
secPrint("DMOV_RABX_SELARR %hu\n", i)
|
||||
instCase(DDIST_RABX_SELARR)
|
||||
regex_tai_t s = extract_track_array_index();
|
||||
regex_tai_t e = extract_track_array_index();
|
||||
secPrint("DDIST_RABX_SELARR %hu %hu\n", s, e);
|
||||
simpleDimple(SIFTPRIOR_MIN_RABX)
|
||||
simpleDimple(SIFTPRIOR_MAX_RABX)
|
||||
simpleDimple(SIFT_DONE)
|
||||
instCase(MOV_COLARR_IMM)
|
||||
regex_tai_t tai = extract_track_array_index();
|
||||
uint64_t imm = extract_qw();
|
||||
secPrint("MOV_COLARR_IMM %hu %lu\n", tai, imm);
|
||||
instCase(MOV_COLARR_BTPOS)
|
||||
regex_tai_t tai = extract_track_array_index();
|
||||
secPrint("MOV_COLARR_BTPOS %hu\n", tai);
|
||||
instCase(MOV_SELARR_IMM)
|
||||
regex_tai_t tai = extract_track_array_index();
|
||||
uint64_t imm = extract_qw();
|
||||
secPrint("MOV_SELARR_IMM %hu %lu\n", tai, imm);
|
||||
instCase(MOV_SELARR_CHPOS)
|
||||
regex_tai_t tai = extract_track_array_index();
|
||||
secPrint("MOV_SELARR_CHPOS %hu\n", tai);
|
||||
simpleDimple(INIT)
|
||||
simpleDimple(THROW)
|
||||
default:
|
||||
exitf("Bad opcode\n");
|
||||
#undef secPrint
|
||||
#undef secPrintNoArg
|
||||
#undef instCase
|
||||
#undef jcMess
|
||||
#undef jcCacaphony
|
||||
#undef simpleDimple
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
one_reading();
|
||||
second_phase = true;
|
||||
IP = 0;
|
||||
one_reading();
|
||||
}
|
||||
|
||||
#endif //LIBREGEXIS024_BYTE_CODE_DISASSEMBLER_H
|
52
src/libregexis024test/test0.cpp
Normal file
52
src/libregexis024test/test0.cpp
Normal file
@ -0,0 +1,52 @@
|
||||
#include <libregexis024fa/codeset.h>
|
||||
#include <libregexis024vm/utils.h>
|
||||
#include <stdio.h>
|
||||
|
||||
void test_ccs_fnc(const codeset_t &got, const codeset_t &expected){
|
||||
static int id = 1;
|
||||
if (got == expected)
|
||||
printf("Test %d passed\n", id++);
|
||||
else
|
||||
exitf("Test %d failed\n", id);
|
||||
}
|
||||
|
||||
void invert_test(const codeset_t& A, const codeset_t& C){
|
||||
test_ccs_fnc(invert_set(A), C);
|
||||
test_ccs_fnc(invert_set(C), A);
|
||||
}
|
||||
|
||||
void merge_test(const codeset_t& A, const codeset_t& B, const codeset_t& C){
|
||||
test_ccs_fnc(merge_sets(A, A), A);
|
||||
test_ccs_fnc(merge_sets(B, B), B);
|
||||
test_ccs_fnc(merge_sets(A, B), C);
|
||||
test_ccs_fnc(merge_sets(B, A), C);
|
||||
}
|
||||
|
||||
void intersect_test(const codeset_t& A, const codeset_t& B, const codeset_t& C){
|
||||
test_ccs_fnc(intersect_sets(A, A), A);
|
||||
test_ccs_fnc(intersect_sets(B, B), B);
|
||||
test_ccs_fnc(intersect_sets(A, B), C);
|
||||
test_ccs_fnc(intersect_sets(B, A), C);
|
||||
}
|
||||
|
||||
/* Runs the codeset tests. The order of the calls is significant: test numbers printed by
 * test_ccs_fnc are assigned in call order. */
int main(){
    /* merging with an empty set and merging of separated single codes */
    merge_test({{34, 111}}, {}, {{34, 111}});
    merge_test({{1, 1}}, {{3, 3}}, {{1, 1}, {3, 3}});

    /* inversion at the lower border and in the middle of the code range */
    invert_test({{0, 1}}, {{2, UINT32_MAX}});
    invert_test({{32, 34}}, {{0, 31}, {35, UINT32_MAX}});

    /* merging of touching and overlapping ranges */
    merge_test({{10, 10}, {20, 20}}, {{19, 19}}, {{10, 10}, {19, 20}});
    merge_test({{0, 5}, {7, 10}}, {{4, 6}}, {{0, 10}});
    merge_test({{1, 10}, {50, 60}}, {{11, 70}}, {{1, 70}});
    merge_test({{23, 23}, {56, 100}}, {{30, 55}}, {{23, 23}, {30, 100}});

    /* intersections */
    intersect_test({{100, 200}, {300, 400}}, {}, {});
    intersect_test({{2, 30}}, {{15, 50}}, {{15, 30}});
    intersect_test({{10, 30}}, {{15, 25}}, {{15, 25}});
    intersect_test({{10, 20}}, {{21, 30}}, {});
    intersect_test({{1, 100}, {150, 200}}, {{50, 175}}, {{50, 100}, {150, 175}});
    intersect_test({{1, 100}}, {}, {});
    intersect_test({{50, 50}}, {{50, 50}}, {{50, 50}});
    intersect_test({{49, 49}}, {{50, 50}}, {});
    intersect_test({{1, 20}, {50, 80}}, {{10, 55}, {60, 100}}, {{10, 20}, {50, 55}, {60, 80}});

    /* one more merge: several small ranges glued together by a big one */
    merge_test({{2, 3}, {5, 5}, {7, 7}}, {{1, 1}, {3, 7}}, {{1, 7}});
    return 0;
}
|
136
src/libregexis024test/test1.cpp
Normal file
136
src/libregexis024test/test1.cpp
Normal file
@ -0,0 +1,136 @@
|
||||
#include <stdio.h>
|
||||
|
||||
#include <libregexis024vm/libregexis024vm.h>
|
||||
#include <libregexis024vm/vm_opcodes.h>
|
||||
#include <libregexis024test/byte_code_assembler.h>
|
||||
#include <libregexis024test/byte_code_disassembler.h>
|
||||
#include <assert.h>
|
||||
|
||||
static int test_id = 0;
|
||||
|
||||
/* Meant to run program `prg` on `str` and compare the match status after every prefix with
 * `prefix_matching` (one entry per prefix, including the empty one).
 * NOTE(review): the body is still a stub - the context is created but never run, and the
 * test is reported as passed unconditionally. */
void do_test(const std::vector<uint8_t>& prg, const std::string& str, const std::vector<bool>& prefix_matching){
    assert(prefix_matching.size() == str.size() + 1);
    REGEX_IS024_CONTEXT ctx{prg.size(), prg.data(), 0, 0, 1000, 1000, 1000000};
    regex024_error_code ret;
    // todo
    printf("TEST %d passed\n", test_id++);
}
|
||||
|
||||
/* Returns n + 1 flags: the first one is true, all the others are false */
std::vector<bool> tfff(size_t n){
    std::vector<bool> flags(n + 1);
    flags.front() = true;
    return flags;
}
|
||||
|
||||
/* Returns n + 1 flags: the last one is true, all the others are false */
std::vector<bool> ffft(size_t n){
    std::vector<bool> flags(n + 1);
    flags[n] = true;
    return flags;
}
|
||||
|
||||
/* Assembles a program with one fork: one branch jumps to "finish" on byte 'a',
 * the other one on byte 'b'; "finish" executes MATCH. */
std::vector<uint8_t> program_0(){
    std::vector<uint8_t> bytecode;
    s_BEGIN_ASSEMBLER_CONTEXT()
    i_PARAM_READ_SS_NUMBER(2)
    i_PARAM_FORK_SS_NUMBER(1)
    i_INIT()
    i_FORK(0, "wb")
    /* branch that waits for 'a' */
    i_READ(0)
    i_JCEQUAL_B('a', "finish")
    i_DIE()

    /* branch that waits for 'b' */
    c_BOOKMARK("wb")
    i_READ(1)
    i_JCEQUAL_B('b', "finish")
    i_DIE()

    c_BOOKMARK("finish")
    i_MATCH()
    i_DIE()
    s_END_ASSEMBLER_CONTEXT(bytecode)
    return bytecode;
}
|
||||
|
||||
/* Assembles a program with two consecutive forks: the first pair of branches waits for
 * 'a' or 'b' and continues at "razvilka", the second pair waits for 'c' or 'd' and
 * continues at "finish", which executes MATCH. */
std::vector<uint8_t> program_1(){
    std::vector<uint8_t> bytecode;
    s_BEGIN_ASSEMBLER_CONTEXT()
    i_PARAM_READ_SS_NUMBER(4)
    i_PARAM_FORK_SS_NUMBER(2)
    // i_PARAM_SELARR_LEN(0)
    i_INIT()
    i_FORK(0, "wb")
    c_BOOKMARK("wa")
    i_READ(0)
    i_JCEQUAL_B('a', "razvilka")
    i_DIE()

    c_BOOKMARK("wb")
    i_READ(1)
    i_JCEQUAL_B('b', "razvilka")
    i_DIE()

    /* second fork */
    c_BOOKMARK("razvilka")
    i_FORK(1, "wd")
    c_BOOKMARK("wc")
    i_READ(2)
    i_JCEQUAL_B('c', "finish")
    i_DIE()

    c_BOOKMARK("wd")
    i_READ(3)
    i_JCEQUAL_B('d', "finish")
    i_DIE()

    c_BOOKMARK("finish")
    i_MATCH()
    i_DIE()
    s_END_ASSEMBLER_CONTEXT(bytecode)
    return bytecode;
}
|
||||
|
||||
/*
|
||||
int main(){
|
||||
auto prg0 = program_0();
|
||||
auto prg1 = program_1();
|
||||
// printf("Disassembled program:\n");
|
||||
// print_disassembly(prg.size(), prg.data());
|
||||
|
||||
printf("Testing starts\n");
|
||||
test_id = 0;
|
||||
do_test(prg0, "a", {false, true});
|
||||
do_test(prg0, "b", {false, true});
|
||||
do_test(prg0, "c", {false, false});
|
||||
do_test(prg0, "a4", {false, true, false});
|
||||
do_test(prg0, "b4", {false, true, false});
|
||||
do_test(prg1, "aa", {false, false, false});
|
||||
do_test(prg1, "db", {false, false, false});
|
||||
do_test(prg1, "ac", {false, false, true});
|
||||
do_test(prg1, "bc", {false, false, true});
|
||||
do_test(prg1, "ad", {false, false, true});
|
||||
do_test(prg1, "bd", {false, false, true});
|
||||
do_test(prg1, "bd12", {false, false, true, false, false});
|
||||
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
/* Assembles a small program with a backward and a forward reference and prints its
 * disassembly; the program is never executed. */
int main() {
    std::vector<uint8_t> bytecode;
    s_BEGIN_ASSEMBLER_CONTEXT()
    c_BOOKMARK("111")
    i_READ(0)
    i_READ(1)
    i_FORK(12, "vdv")
    c_BOOKMARK("vdv")
    i_READ(2)
    i_READ(10000000)
    i_THROW()
    i_THROW()
    i_THROW()
    i_JUMP("111")

    s_END_ASSEMBLER_CONTEXT(bytecode)

    print_disassembly(bytecode.size(), bytecode.data());
}
|
12
src/libregexis024test/test2.cpp
Normal file
12
src/libregexis024test/test2.cpp
Normal file
@ -0,0 +1,12 @@
|
||||
#include <libregexis024sol/expr_compiler.h>
|
||||
#include <libregexis024test/byte_code_disassembler.h>
|
||||
|
||||
int main(){
|
||||
std::string regular_expression = "!selarr{boba{ca}}^a#boba(b)c$";
|
||||
REGEX_IS024_MeaningContext regex(regular_expression.size(), regular_expression.c_str());
|
||||
if (regex.error)
|
||||
fprintf(stderr, "%s\n", regex.error_msg.c_str());
|
||||
std::vector<uint8_t> res = regex.compiled_program;
|
||||
print_disassembly(res.size(), res.data());
|
||||
return 0;
|
||||
}
|
217
src/libregexis024test/test3.cpp
Normal file
217
src/libregexis024test/test3.cpp
Normal file
@ -0,0 +1,217 @@
|
||||
#include <libregexis024fa/colored_codeset.h>
|
||||
#include <assert.h>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include <map>
|
||||
#include <exception>
|
||||
#include <stdexcept>
|
||||
#include <random>
|
||||
|
||||
/* Identifies one run: the number of the test and the number of the shuffled
 * subtest inside it. Both are stored zero-based. */
struct test_id_t {
    int test_id;
    int subtest_id;

    test_id_t(int test_id, int subtest_id) : test_id(test_id),subtest_id(subtest_id) {}

    /* Human-readable one-based form, e.g. "#1::1" for the very first subtest */
    std::string toString() const {
        std::string text = "#";
        text += std::to_string(test_id + 1);
        text += "::";
        text += std::to_string(subtest_id + 1);
        return text;
    }
};
|
||||
|
||||
std::string stringifyCodeset(const codeset_t& CS) {
|
||||
std::string cs;
|
||||
for (auto p: CS) {
|
||||
if (!cs.empty())
|
||||
cs += "; ";
|
||||
cs += std::to_string(p.first) + "-" + std::to_string(p.second);
|
||||
}
|
||||
return cs;
|
||||
}
|
||||
|
||||
/* Renders a list of request numbers as "a, b, c"; an empty list gives "" */
std::string stringifyRequestList(const std::vector<uint64_t>& arr) {
    std::string text;
    const char* separator = "";
    for (uint64_t request: arr) {
        text += separator;
        text += std::to_string(request);
        separator = ", ";
    }
    return text;
}
|
||||
|
||||
void print_obj(const std::vector<std::pair<codeset_t, std::vector<size_t>>>& answer) {
|
||||
size_t R = answer.size();
|
||||
for (int i = 0; i < R; i++) {
|
||||
std::string cs = stringifyCodeset(answer[i].first);
|
||||
std::string rl = stringifyRequestList(answer[i].second);
|
||||
printf("{%s} -> {%s}\n", cs.c_str(), rl.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
void print_answer_canonic(const std::map<std::vector<size_t>, codeset_t>& answer) {
|
||||
for (auto& p: answer) {
|
||||
printf("{%s} -> {%s}\n", stringifyCodeset(p.second).c_str(), stringifyRequestList(p.first).c_str());
|
||||
}
|
||||
}
|
||||
|
||||
void print_test_details(int dummyN, const std::vector<codeset_t>& requests) {
|
||||
for (int i = 0; i < requests.size(); i++) {
|
||||
printf("%d) %s\n", i - dummyN, stringifyCodeset(requests[i]).c_str());
|
||||
}
|
||||
}
|
||||
|
||||
/* Aborts the current test by throwing std::runtime_error("Test <id> failed") */
void fail_test(test_id_t my_test_id) {
    const std::string message = "Test " + my_test_id.toString() + " failed";
    throw std::runtime_error(message);
}
|
||||
|
||||
/* Brings an answer to canonic form: a map from request list to the union of all codesets
 * listed for it. The codesets listed for one request list must not intersect each other;
 * if they do, the test is failed. */
std::map<std::vector<size_t>, codeset_t> safe_canonificate_answer(test_id_t my_test_id,
    const std::vector<std::pair<codeset_t, std::vector<size_t>>>& answer)
{
    std::map<std::vector<size_t>, codeset_t> canonic;
    for (const auto& entry: answer) {
        /* operator[] creates an empty codeset the first time a request list shows up */
        codeset_t& collected = canonic[entry.second];
        const codeset_t overlap = intersect_sets(collected, entry.first);
        if (!overlap.empty())
            fail_test(my_test_id);
        collected = merge_sets(collected, entry.first);
    }
    return canonic;
}
|
||||
|
||||
/* Combines two parallel vectors into a vector of (ti[i], to[i]) pairs.
 * The test is failed when the vectors have different lengths. */
std::vector<std::pair<codeset_t, std::vector<size_t>>> safe_zip_answer(test_id_t my_test_id,
    const std::vector<codeset_t>& ti, const std::vector<std::vector<size_t>>& to)
{
    const size_t count = ti.size();
    if (to.size() != count)
        fail_test(my_test_id);
    std::vector<std::pair<codeset_t, std::vector<size_t>>> zipped;
    zipped.reserve(count);
    for (size_t i = 0; i < count; i++)
        zipped.emplace_back(ti[i], to[i]);
    return zipped;
}
|
||||
|
||||
/* Runs one subtest: feeds all codesets of rqInpCs to a ColoredCodeset created with
 * InpDummyN, reads the resulting splits back and compares them, in canonic form,
 * with answer_right. On a mismatch both answers are printed and the test is failed. */
void perform_test(test_id_t my_test_id, int InpDummyN, const std::vector<codeset_t>& rqInpCs, const std::vector<std::pair<codeset_t, std::vector<size_t>>>& answer_right)
{
    typedef std::map<std::vector<size_t>, codeset_t> canonic_answer_t;
    typedef std::vector<std::pair<codeset_t, std::vector<size_t>>> answer_t;

    const canonic_answer_t expected = safe_canonificate_answer(my_test_id, answer_right);

    ColoredCodeset cc(InpDummyN);
    for (const codeset_t& divisor: rqInpCs)
        cc.apply_divisor(divisor);

    std::vector<codeset_t> ti;
    std::vector<std::vector<size_t>> to;
    cc.get_splits_of_non_dummy(ti, to);

    const answer_t answer_returned = safe_zip_answer(my_test_id, ti, to);
    const canonic_answer_t returned = safe_canonificate_answer(my_test_id, answer_returned);

    if (!(expected == returned)) {
        printf("Test failed!!\n");
        printf("Test details:\n");
        print_test_details(InpDummyN, rqInpCs);
        printf("Right answer:\n");
        print_obj(answer_right);
        printf("Given answer:\n");
        print_obj(answer_returned);
        fail_test(my_test_id);
    }
    printf("Test %s passed\n", my_test_id.toString().c_str());
}
|
||||
|
||||
void perform_test_with_shuffle(const std::vector<codeset_t>& rqInpCs,
|
||||
const std::vector<std::pair<codeset_t, std::vector<size_t>>>& answer)
|
||||
{
|
||||
static int cur_test_id = 0;
|
||||
int my_test_id = cur_test_id++;
|
||||
int dn = rqInpCs.size();
|
||||
std::vector<int> shuf(dn);
|
||||
for (int i = 0; i < dn; i++)
|
||||
shuf[i] = i;
|
||||
for (int stid = 0; stid < 14; stid++) {
|
||||
std::random_device d;
|
||||
std::mt19937 r_gen(d());
|
||||
std::shuffle(shuf.begin(), shuf.end(), r_gen);
|
||||
int InpDummyN = std::uniform_int_distribution<int>(0, dn - 1)(r_gen);
|
||||
// for (int e: shuf)
|
||||
// printf("%d ", e);
|
||||
// printf("\n");
|
||||
std::vector<codeset_t> new_rqInpCs(dn);
|
||||
for (int i = 0; i < dn; i++)
|
||||
new_rqInpCs[shuf[i]] = rqInpCs[i];
|
||||
std::vector<std::pair<codeset_t, std::vector<size_t>>> new_right_answer;
|
||||
for (const std::pair<codeset_t, std::vector<size_t>>& p: answer) {
|
||||
std::vector<uint64_t> new_request_list;
|
||||
for (size_t oldRid: p.second) {
|
||||
size_t unabridged_new_rid = shuf[oldRid];
|
||||
if (unabridged_new_rid >= InpDummyN) {
|
||||
new_request_list.push_back(unabridged_new_rid - InpDummyN);
|
||||
}
|
||||
}
|
||||
if (!new_request_list.empty()) {
|
||||
std::sort(new_request_list.begin(), new_request_list.end());
|
||||
new_right_answer.emplace_back(p.first, new_request_list);
|
||||
}
|
||||
}
|
||||
perform_test(test_id_t(my_test_id, stid), InpDummyN, new_rqInpCs, new_right_answer);
|
||||
}
|
||||
}
|
||||
|
||||
int main() {
|
||||
perform_test_with_shuffle(
|
||||
{
|
||||
{{5, 500}},
|
||||
{{10, 20}},
|
||||
{{5, 10}},
|
||||
},
|
||||
{
|
||||
{{{21, 500}}, {0}},
|
||||
{{{11, 20}}, {0, 1}},
|
||||
{{{5, 9}}, {0, 2}},
|
||||
{{{10, 10}}, {0, 1, 2}},
|
||||
});
|
||||
|
||||
perform_test_with_shuffle({
|
||||
{{10, 19}},
|
||||
{{10, 15}},
|
||||
{{5, 9}},
|
||||
{{20, 40}},
|
||||
{{16, UINT32_MAX}},
|
||||
},
|
||||
{
|
||||
{{{5, 9}}, {2}},
|
||||
{{{10, 15}}, {0, 1}},
|
||||
{{{16, 19}}, {0, 4}},
|
||||
{{{20, 40}}, {3, 4}},
|
||||
{{{41, UINT32_MAX}}, {4}}
|
||||
|
||||
});
|
||||
|
||||
perform_test_with_shuffle(
|
||||
{
|
||||
{{10, 19}, {30, 39}, {50, 69}},
|
||||
{{20, 29}, {40, 59}, },
|
||||
{{20, 39}, {70, 79}},
|
||||
codeset_of_one_char(UINT32_MAX - 1),
|
||||
codeset_of_one_char(UINT32_MAX),
|
||||
{{UINT32_MAX - 1, UINT32_MAX}},
|
||||
codeset_of_one_char(0),
|
||||
codeset_of_one_char(1),
|
||||
{{0, 1}},
|
||||
},
|
||||
{
|
||||
{{{10, 19}, {60, 69}}, {0}},
|
||||
{{{40, 49}}, {1}},
|
||||
{{{70, 79}}, {2}},
|
||||
{{{50, 59}}, {0, 1}},
|
||||
{{{20, 29}}, {1, 2}},
|
||||
{{{30, 39}}, {0, 2}},
|
||||
{{{0, 0}}, {6, 8}},
|
||||
{{{1, 1}}, {7, 8}},
|
||||
{{{UINT32_MAX - 1, UINT32_MAX - 1}}, {3, 5}},
|
||||
{{{UINT32_MAX, UINT32_MAX}}, {4, 5}},
|
||||
});
|
||||
return 0;
|
||||
}
|
43
src/libregexis024test/test4.cpp
Normal file
43
src/libregexis024test/test4.cpp
Normal file
@ -0,0 +1,43 @@
|
||||
#include <libregexis024tools/stringmatching.h>
|
||||
#include <assert.h>
|
||||
#include <stdexcept>
|
||||
#include <libregexis024sol/expr_compiler.h>
|
||||
|
||||
using namespace regexis024;
|
||||
using namespace std;
|
||||
|
||||
void test(const string& input, const string& pattern, const MatchInfo& right_answer) {
|
||||
MatchInfo given_answer;
|
||||
track_var_list retTrackVarList;
|
||||
string retStatus;
|
||||
matchStrToRegexp(input, pattern, given_answer, retTrackVarList, retStatus);
|
||||
if (given_answer != right_answer) {
|
||||
throw runtime_error("Test failed");
|
||||
}
|
||||
printf("Test passed\n");
|
||||
}
|
||||
|
||||
/* Runs the string matching tests; the first failed one ends the program with an exception */
int main() {
    test("b", "#boba(b)", MatchInfo({{0, 0}, {1, 1}}, {}));
    test("abc", "!selarr{boba{ca}}^a#boba(b)c$", MatchInfo({{0, 1}, {1, 2}}, {1, 2}));

    /* all 64 two-letter strings over the letters a..h */
    for (int i = 0; i < 64; i++) {
        const char first_letter = static_cast<char>('a' + i / 8);
        const char second_letter = static_cast<char>('a' + i % 8);
        const std::string T{first_letter, second_letter};
        test(T, "(((a|b)|(c|d))|((e|f)|(g|h)))!r{2}", MatchInfo({}, {}));
    }

    /* the same patterns with and without the !dfa; prefix */
    test("abba", "!select{M{max}}a#M(b*)a", MatchInfo({}, {1, 3}));
    test("abba", "!dfa;!select{M{max}}a#M(b*)a", MatchInfo({}, {1, 3}));
    test("abba", "!select{M{max}}a#M(!any;*)a", MatchInfo({}, {1, 3}));
    test("abba", "!dfa;!select{M{max}}a#M(!any;*)a", MatchInfo({}, {1, 3}));

    /* simple patterns */
    test("", "", MatchInfo({}, {}));
    test("a", "a", MatchInfo({}, {}));
    test("a3", "[abc]3", MatchInfo({}, {}));
    test("b3", "[abc]3", MatchInfo({}, {}));
    test("c3", "[abc]3", MatchInfo({}, {}));
    test("aa", "aa", MatchInfo({}, {}));
    test("aaaaa", "a*", MatchInfo({}, {}));
    test("bababbaa", "[ab]*", MatchInfo({}, {}));
    test("bababbaa", "!dfa;[ab]*", MatchInfo({}, {}));
    return 0;
}
|
14
src/libregexis024test/vibe_check.h
Normal file
14
src/libregexis024test/vibe_check.h
Normal file
@ -0,0 +1,14 @@
|
||||
#ifndef VIBE_CHECK_H
|
||||
#define VIBE_CHECK_H
|
||||
|
||||
/* __ORDER_LITTLE_ENDIAN__ is predefined by GCC and Clang on every target (it is only the constant
 * that __BYTE_ORDER__ gets compared with), so checking that it is merely defined never rejects a
 * big endian build. Compare the actual byte order instead. Compilers that provide neither macro
 * are still rejected, exactly as before. */
#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || (__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__)
#error "All the cool kids use little endian. Get lost, you are forbidden from entering this party"
#endif
|
||||
|
||||
#ifndef _GLIBCXX_DEBUG
|
||||
#error "Kinda stupid to test without _GLIBCXX_DEBIG. Or... Don't tell me you are using this header in non-testing environment. OH MY \
|
||||
GOD! THIS LUNATIC USES TESTING HEADER IN PRODUCTION CODE. I-I-I am calling 911, COMON, SOMEBODY CATCH HIM AND PUT HIM IN LOONEYBLOCK!!!"
|
||||
#endif
|
||||
|
||||
|
||||
#endif //VIBE_CHECK_H
|
109
src/libregexis024tools/stringmatching.cpp
Normal file
109
src/libregexis024tools/stringmatching.cpp
Normal file
@ -0,0 +1,109 @@
|
||||
#include <algorithm>
|
||||
#include <libregexis024tools/stringmatching.h>
|
||||
#include <libregexis024sol/expr_compiler.h>
|
||||
#include <libregexis024vm/libregexis024vm_interface.h>
|
||||
#include <libregexis024vm/utils.h>
|
||||
#include <assert.h>
|
||||
|
||||
// using namespace regexis024;
|
||||
|
||||
/* Copies the identically named fields of a compiler-side SubtrackingNameInfo
 * into the public TrackingVariableInfo */
void convert(regexis024::TrackingVariableInfo& to, const SubtrackingNameInfo& from) {
    to.type = from.type;
    to.colarr_first = from.colarr_first;
    to.colarr_second = from.colarr_second;
    to.stored_in_ca = from.stored_in_ca;
    to.selarr_first = from.selarr_first;
    to.selarr_second = from.selarr_second;
    to.stored_in_sa = from.stored_in_sa;
}
|
||||
|
||||
int regexis024::matchStrToRegexp(const std::string& input, const std::string& pattern,
|
||||
MatchInfo& retMatchInfo, track_var_list& retTrackVarList, std::string& retStatus)
|
||||
{
|
||||
retTrackVarList = {};
|
||||
retMatchInfo = MatchInfo();
|
||||
retStatus = "";
|
||||
REGEX_IS024_MeaningContext regexp(pattern.size(), pattern.data());
|
||||
if (regexp.error) {
|
||||
retStatus = "Pattern compilation. " + regexp.error_msg;
|
||||
return -1;
|
||||
}
|
||||
retTrackVarList = {};
|
||||
for (auto& iip: regexp.ktr.track_names) {
|
||||
convert(retTrackVarList[iip.first], regexp.ktr.retrieval_info[iip.second]);
|
||||
}
|
||||
REGEX_IS024_VirtualMachine vm(regexp.compiled_program.size(), regexp.compiled_program.data(),
|
||||
UINT64_MAX, UINT16_MAX,
|
||||
UINT32_MAX, UINT32_MAX, UINT64_MAX);
|
||||
auto getVMErrString = [&]() -> std::string {
|
||||
return std::string(regex024_error_code_tostr(vm.getErrno()));
|
||||
};
|
||||
|
||||
if (vm.initialize() != regex024_error_codes::stable) {
|
||||
retStatus = "Virtual machine initialization. " + getVMErrString();
|
||||
return -1;
|
||||
}
|
||||
int left_ext_feed = vm.getInputLeftExtensionSize();
|
||||
int right_ext_feed = vm.getInputRightExtensionSize();
|
||||
if (left_ext_feed > 1 || right_ext_feed > 1) {
|
||||
retStatus = "Unnatural extended input request.";
|
||||
return -1;
|
||||
}
|
||||
if (vm.addNewMatchingThread() != regex024_error_codes::stable) {
|
||||
retStatus = "Virtual machine first kick. " + getVMErrString();
|
||||
}
|
||||
if (left_ext_feed) {
|
||||
if (vm.extendedFeedCharacter('\n') != regex024_error_codes::stable) {
|
||||
retStatus = "VM left extended input. " + getVMErrString();
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
for (size_t cur_text_pos = 0;cur_text_pos < input.size();) {
|
||||
int32_t inp_code;
|
||||
size_t adj;
|
||||
utf8_string_iterat(inp_code, adj, cur_text_pos, reinterpret_cast<const uint8_t*>(input.data()), input.size());
|
||||
if (inp_code < 0) {
|
||||
retStatus = "Input string encoding error.";
|
||||
return -1;
|
||||
}
|
||||
if (vm.feedCharacter(static_cast<uint64_t>(inp_code), adj) != regex024_error_codes::stable) {
|
||||
retStatus = "VM input. " + getVMErrString();
|
||||
return -1;
|
||||
}
|
||||
cur_text_pos += adj;
|
||||
}
|
||||
if (right_ext_feed) {
|
||||
if (vm.extendedFeedCharacter('\n') != regex024_error_codes::stable) {
|
||||
retStatus = "VM right extended input. " + getVMErrString();
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
assert(vm.isUsable());
|
||||
if (vm.isMatched()) {
|
||||
retMatchInfo.have_match = true;
|
||||
size_t SN1 = vm.getSelectionArrayLength();
|
||||
retMatchInfo.sa.assign(SN1, 0);
|
||||
for (size_t i = 0; i < SN1; i++)
|
||||
retMatchInfo.sa[i] = vm.getMatchedThreadSAValue(i);
|
||||
retMatchInfo.ca_history = vm.getMatchedThreadCABranchReverse();
|
||||
std::reverse(retMatchInfo.ca_history.begin(), retMatchInfo.ca_history.end());
|
||||
return 0;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Two results without a match are always equal, whatever their arrays hold.
 * Otherwise both must have a match and carry equal selection array and collection history. */
bool regexis024::MatchInfo::operator==(const MatchInfo &other) const {
    if (have_match != other.have_match)
        return false;
    if (!have_match)
        return true;
    return sa == other.sa && ca_history == other.ca_history;
}
|
||||
|
||||
/* Exact negation of operator== */
bool regexis024::MatchInfo::operator!=(const MatchInfo &other) const {
    const bool same = (*this == other);
    return !same;
}
|
||||
|
||||
/* Builds a result that represents a successful match with the given history and selection array.
 * Initializers are listed in member declaration order. */
regexis024::MatchInfo::MatchInfo(const std::vector<REGEX_IS024_CAEvent> &ca_history, const std::vector<uint64_t> &sa)
    : have_match(true),
      ca_history(ca_history),
      sa(sa)
{
}
|
42
src/libregexis024tools/stringmatching.h
Normal file
42
src/libregexis024tools/stringmatching.h
Normal file
@ -0,0 +1,42 @@
|
||||
#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024TOOLS_STRINGMATCHING_H
|
||||
#define LIBREGEXIS024_SRC_LIBREGEXIS024TOOLS_STRINGMATCHING_H
|
||||
|
||||
#include <libregexis024fa/tracking_variables.h>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <libregexis024vm/libregexis024vm_interface.h>
|
||||
|
||||
namespace regexis024 {
|
||||
struct TrackingVariableInfo {
|
||||
bool stored_in_ca = true;
|
||||
bool stored_in_sa = false;
|
||||
|
||||
tracking_var_type type;
|
||||
/* These fields will be -1 if unused */
|
||||
int colarr_first = -1;
|
||||
int colarr_second = -1;
|
||||
|
||||
int selarr_first = -1;
|
||||
int selarr_second = -1;
|
||||
};
|
||||
|
||||
/* Tracking variable name -> description of where its values are stored */
using track_var_list = std::map<std::string, TrackingVariableInfo>;
|
||||
|
||||
struct MatchInfo {
|
||||
bool have_match = false;
|
||||
std::vector<REGEX_IS024_CAEvent> ca_history;
|
||||
std::vector<uint64_t> sa;
|
||||
|
||||
bool operator==(const MatchInfo& other) const ;
|
||||
bool operator!=(const MatchInfo& other) const ;
|
||||
|
||||
MatchInfo() = default;
|
||||
|
||||
MatchInfo(const std::vector<REGEX_IS024_CAEvent> &ca_history, const std::vector<uint64_t> &sa);
|
||||
};
|
||||
|
||||
int matchStrToRegexp(const std::string& input, const std::string& pattern,
|
||||
MatchInfo& retMatchInfo, track_var_list& retTrackVarList, std::string& retStatus);
|
||||
}
|
||||
|
||||
#endif
|
491
src/libregexis024vm/instruction_implementation.cpp
Normal file
491
src/libregexis024vm/instruction_implementation.cpp
Normal file
@ -0,0 +1,491 @@
|
||||
#include <libregexis024vm/instruction_implementation.h>
|
||||
#include <stdexcept>
|
||||
|
||||
/* Exchanges the active thread with a thread settled in a slot before the current step.
 * The slot receives the active thread marked as NEW; the former occupant becomes active and
 * keeps the OCCUPIED status it had while sitting in the slot. */
void swap_old_settled_and_new_active(REGEX_IS024_CONTEXT &ctx, REGEX_IS024_Thread& old_settled){
    ctx_print_debug(ctx);
    assert(old_settled.slot_occupation_status == SLOT_OCCUPIED_val);
    const REGEX_IS024_Thread evicted = old_settled;
    old_settled = ctx.active_thread;
    old_settled.slot_occupation_status = SLOT_NEW_val;
    ctx.active_thread = evicted;
}
|
||||
|
||||
/* The active thread ran into a READ slot already taken by `other`.
 * Without a sift function the active thread simply dies. Otherwise the sift function is entered:
 * the return point is remembered and both comparison registers are cleared. */
void start_noncloning_conflict(REGEX_IS024_CONTEXT& ctx, REGEX_IS024_Thread& other){
    ctx_print_debug(ctx);
    if (!ctx.have_sift_function){
        ctx.active_thread.delete_thread();
        ctx.try_to_continue_scheduled();
        return;
    }
    ctx.sifting_with = &other;
    ctx.who_started_sift = regex024_opcode::READ;
    ctx.intruder_IP = ctx.active_thread.IP;
    ctx.active_thread.IP = ctx.sift_function;
    ctx.RBX = 0;
    ctx.RAX = 0;
}
|
||||
|
||||
/* The one that drops as an intruder here is current active.thread.IP */
|
||||
void start_cloning_conflict(REGEX_IS024_CONTEXT& ctx, REGEX_IS024_Thread& other, regex_near_ptr_t clone_IP){
|
||||
ctx_print_debug(ctx);
|
||||
if (ctx.have_sift_function){
|
||||
ctx.sifting_with = &other;
|
||||
ctx.who_started_sift = regex024_opcode::FORK;
|
||||
ctx.intruder_IP = ctx.active_thread.IP;
|
||||
ctx.child_ret_IP = clone_IP;
|
||||
ctx.active_thread.IP = ctx.sift_function;
|
||||
ctx.RAX = ctx.RBX = 0;
|
||||
} else {
|
||||
ctx.active_thread.IP = clone_IP;
|
||||
}
|
||||
}
|
||||
|
||||
/* Phase guards for instruction handlers. Each one expands to plain `if` statements that store an
 * error in ctx and return from the enclosing void function, so they are written without a
 * trailing semicolon at the place of use. */

/* Only before INIT was executed */
#define initialization_phase_check() \
    if (ctx.initialized){ ctx.error = regex024_error_codes::too_late; return; }

/* Only after INIT and not while a sift function is running */
#define general_matching_mode_check() \
    if (!ctx.initialized){ ctx.error = regex024_error_codes::too_early; return; } \
    if(ctx.sifting_with){ ctx.error = regex024_error_codes::instruction_not_for_collision_thread; return; }

/* Only while a sift function is running */
#define sift_mode_check() \
    if (!ctx.sifting_with){ ctx.error = regex024_error_codes::instruction_not_for_collision_thread; return; }
|
||||
|
||||
/* Can append to both read_halted+new stacks of context */
|
||||
void read_halted_new_type_stacks_append(REGEX_IS024_CONTEXT &ctx, regex_sslot_id_t ssid){
|
||||
ctx_print_debug(ctx);
|
||||
if (ssid < ctx.portion_of_FIRST_read_halt_ns){
|
||||
ctx.READ_halted_stack_new_first.append(ssid);
|
||||
} else {
|
||||
ctx.READ_halted_stack_new_second.append(ssid);
|
||||
}
|
||||
}
|
||||
|
||||
/* Common part of READ and READZ: parks the active thread in READ slot `ssid`.
 *  - free slot: the thread settles there as NEW and the next scheduled thread continues;
 *  - slot taken by a NEW thread: a non-cloning conflict starts;
 *  - slot taken by an older thread: the two threads are exchanged. */
void do_i_read(REGEX_IS024_CONTEXT &ctx, regex_sslot_id_t ssid) {
    ctx_print_debug(ctx);
    general_matching_mode_check()
    if (ssid >= ctx.read_slots_number)
        smitsya(read_sslot_out_of_range);
    REGEX_IS024_Thread& resident = ctx.READ_halted_slots[ssid];

    const bool slot_taken = resident.slot_occupation_status & SLOT_OCCUPIED;
    if (!slot_taken){
        resident = ctx.active_thread;
        resident.slot_occupation_status = SLOT_NEW_val;
        ctx.active_thread.slot_occupation_status = SLOT_EMPTY_val;
        read_halted_new_type_stacks_append(ctx, ssid);
        ctx.try_to_continue_scheduled();
        return;
    }
    if (resident.slot_occupation_status & SLOT_NEW){
        start_noncloning_conflict(ctx, resident);
        return;
    }
    swap_old_settled_and_new_active(ctx, resident);
    /* The slot was registered in the stack of old threads only; the stack of new ones must know it too */
    read_halted_new_type_stacks_append(ctx, ssid);
}
|
||||
|
||||
/* READ <sslot id>: halts the active thread in the READ slot named by the operand */
void i_READ(REGEX_IS024_CONTEXT &ctx) {
    ctx_print_debug(ctx);
    check_available_prg(REGEX024_BYTECODE_SSLOT_ID_SZ)
    const regex_sslot_id_t operand_slot = ctx.extract_sslot_id();
    do_i_read(ctx, operand_slot);
}
|
||||
|
||||
/* READZ: operand-less form of READ, always uses READ slot 0 */
void i_READZ(REGEX_IS024_CONTEXT &ctx) {
    ctx_print_debug(ctx);
    const regex_sslot_id_t zero_slot = 0;
    do_i_read(ctx, zero_slot);
}
|
||||
|
||||
/* JUMP <near pointer>: unconditionally moves the active thread to the operand address */
void i_JUMP(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    check_available_prg(REGEX024_BYTECODE_NEAR_POINTER_SZ)
    const regex_near_ptr_t target = ctx.extract_near_pointer();
    ctx.active_thread.IP = target;
}
|
||||
|
||||
template<typename conditionT, typename immArgSzT>
|
||||
void i_JC(REGEX_IS024_CONTEXT& ctx)
|
||||
{
|
||||
ctx_print_debug(ctx);
|
||||
check_available_prg(immArgSzT::byte_sz + REGEX024_BYTECODE_NEAR_POINTER_SZ);
|
||||
uint64_t imm_val_B = immArgSzT::extract(ctx);
|
||||
regex_near_ptr_t dest = ctx.extract_near_pointer();
|
||||
uint64_t imm_val_A = ctx.INP;
|
||||
if (conditionT::call(imm_val_A, imm_val_B))
|
||||
ctx.active_thread.IP = dest;
|
||||
}
|
||||
|
||||
/* Jump condition: A equals B */
struct condEqual{
    static bool call(uint64_t lhs, uint64_t rhs){
        return lhs == rhs;
    }
};
|
||||
/* Jump condition: A is strictly less than B (unsigned comparison) */
struct condLess{
    static bool call(uint64_t lhs, uint64_t rhs){
        return lhs < rhs;
    }
};
|
||||
/* Jump condition: A is strictly greater than B (unsigned comparison) */
struct condGrtr{
    static bool call(uint64_t lhs, uint64_t rhs){
        return rhs < lhs;
    }
};
|
||||
|
||||
struct immArgByte{
|
||||
static constexpr int byte_sz = 1;
|
||||
static uint64_t extract(REGEX_IS024_CONTEXT& ctx){return ctx.extract_b();}
|
||||
};
|
||||
struct immArgWord{
|
||||
static constexpr int byte_sz = 2;
|
||||
static uint64_t extract(REGEX_IS024_CONTEXT& ctx){return ctx.extract_w();}
|
||||
};
|
||||
struct immArgDoubleWord{
|
||||
static constexpr int byte_sz = 4;
|
||||
static uint64_t extract(REGEX_IS024_CONTEXT& ctx){return ctx.extract_dw();}
|
||||
};
|
||||
struct immArgQuadWord{
|
||||
static constexpr int byte_sz = 8;
|
||||
static uint64_t extract(REGEX_IS024_CONTEXT& ctx){return ctx.extract_qw();}
|
||||
};
|
||||
|
||||
/* Copies an occupied thread `source` into the free slot `vessel`.
 * The copy shares the collection array branch and the selection array with the source,
 * so both reference counters (if the objects exist) are raised by one. */
void clone_thread_into_slot(REGEX_IS024_Thread& source, REGEX_IS024_Thread& vessel){
    thread_print_debug(source);
    my_assert(!(vessel.slot_occupation_status & SLOT_OCCUPIED));
    my_assert((source.slot_occupation_status & SLOT_OCCUPIED));
    vessel = source;
    if (vessel.CAHptr)
        vessel.CAHptr->refs += 1;
    /* Element 0 of a selection array is its reference counter */
    if (vessel.SAptr)
        vessel.SAptr[0] += 1;
}
|
||||
|
||||
/* One FORK-slot governs the one single unique position in program: the next one after the fork */
|
||||
void i_FORK(REGEX_IS024_CONTEXT& ctx){
|
||||
ctx_print_debug(ctx);
|
||||
general_matching_mode_check()
|
||||
check_available_prg(REGEX024_BYTECODE_SSLOT_ID_SZ + REGEX024_BYTECODE_NEAR_POINTER_SZ);
|
||||
regex_sslot_id_t ssid = ctx.extract_sslot_id();
|
||||
regex_near_ptr_t dest = ctx.extract_near_pointer();
|
||||
if (ssid >= ctx.fork_slots_number)
|
||||
smitsya(fork_sslot_out_of_range);
|
||||
REGEX_IS024_Thread& other = ctx.FORK_halted_slots[ssid];
|
||||
if (other.slot_occupation_status & SLOT_OCCUPIED){
|
||||
start_cloning_conflict(ctx, other, dest);
|
||||
} else {
|
||||
clone_thread_into_slot(ctx.active_thread, other);
|
||||
ctx.active_thread.IP = dest;
|
||||
ctx.FORK_halted_stack.append(ssid);
|
||||
}
|
||||
}
|
||||
|
||||
/* MATCH: stores a clone of the active thread as the matched thread.
 * If a matched thread already exists, a cloning conflict decides which one stays;
 * the active thread continues from its current IP afterwards. */
void i_MATCH(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    general_matching_mode_check()
    REGEX_IS024_Thread& winner_slot = ctx.matched_thread;
    if (winner_slot.slot_occupation_status & SLOT_OCCUPIED)
        start_cloning_conflict(ctx, winner_slot, ctx.active_thread.IP);
    else
        clone_thread_into_slot(ctx.active_thread, winner_slot);
}
|
||||
|
||||
/* DIE: destroys the active thread and lets the next scheduled one run */
void i_DIE(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    general_matching_mode_check()
    REGEX_IS024_Thread& dying = ctx.active_thread;
    dying.delete_thread();
    ctx.try_to_continue_scheduled();
}
|
||||
|
||||
/* PARAM_READ_SS_NUMBER <count>: declares how many READ slots the program uses (before INIT only) */
void i_PARAM_READ_SS_NUMBER(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    initialization_phase_check()
    check_available_prg(REGEX024_BYTECODE_SSLOT_ID_SZ)
    ctx.read_slots_number = ctx.extract_sslot_id();
}
|
||||
|
||||
/* PARAM_FORK_SS_NUMBER <count>: declares how many FORK slots the program uses (before INIT only) */
void i_PARAM_FORK_SS_NUMBER(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    initialization_phase_check()
    check_available_prg(REGEX024_BYTECODE_SSLOT_ID_SZ)
    ctx.fork_slots_number = ctx.extract_sslot_id();
}
|
||||
|
||||
/* PARAM_SELARR_LEN <length>: declares the selection array length (before INIT only) */
void i_PARAM_SELARR_LEN(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    initialization_phase_check()
    check_available_prg(REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ)
    ctx.selection_array_len = ctx.extract_track_array_index();
}
|
||||
|
||||
/* PARAM_COLSIFTFUNC_SET <near pointer>: installs the sift function used to settle
 * slot conflicts (before INIT only) */
void i_PARAM_COLSIFTFUNC_SET(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    initialization_phase_check()
    check_available_prg(REGEX024_BYTECODE_NEAR_POINTER_SZ)
    const regex_near_ptr_t entry_point = ctx.extract_near_pointer();
    ctx.have_sift_function = true;
    ctx.sift_function = entry_point;
}
|
||||
|
||||
/* PARAM_COLSIFTFUNC_WIPE: forgets a previously installed sift function (before INIT only).
 * Only the flag is cleared; the stored address is left as it is. */
void i_PARAM_COLSIFTFUNC_WIPE(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    initialization_phase_check()
    ctx.have_sift_function = false;
}
|
||||
|
||||
/* MSG_MULTISTART_ALLOWED <byte>: a non-zero operand marks the program as allowing
 * multistart (before INIT only) */
void i_MSG_MULTISTART_ALLOWED(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    initialization_phase_check()
    check_available_prg(1)
    ctx.allows_multistart = static_cast<bool>(ctx.extract_b());
}
|
||||
|
||||
/* MSG_FED_INPUT_EXTENDED <left byte> <right byte> <sslot count> (before INIT only).
 * Operands are read strictly in this order: left extension, right extension and the size of
 * the second portion of READ slots. */
void i_MSG_FED_INPUT_EXTENDED(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    initialization_phase_check()
    check_available_prg(1 + 1 + REGEX024_BYTECODE_SSLOT_ID_SZ)
    const uint8_t left_extension = ctx.extract_b();
    const uint8_t right_extension = ctx.extract_b();
    const regex_sslot_id_t second_portion = ctx.extract_sslot_id();
    ctx.fed_input_extends_left = left_extension;
    ctx.fed_input_extends_right = right_extension;
    ctx.portion_of_second_read_halt_ns = second_portion;
}
|
||||
|
||||
/* Reads element `ind` of a selection array. Storage index 0 holds the reference counter,
 * hence the shift by one. A thread without a selection array reads as all zeroes. */
uint64_t get_el_from_selarr(uint64_t* sa, regex_near_ptr_t ind){
    if (!sa)
        return 0;
    return sa[1UL + ind];
}
|
||||
|
||||
/* DMOV_RABX_SELARR <index> (sift mode only): loads the selection array element `index`
 * of the active thread into RAX and of the thread it is sifted with into RBX */
void i_DMOV_RABX_SELARR(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    sift_mode_check()
    check_available_prg(REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ)
    const regex_tai_t element = ctx.extract_track_array_index();
    if (element >= ctx.selection_array_len)
        smitsya(selection_arr_out_of_range);
    uint64_t* const intruder_sa = ctx.active_thread.SAptr;
    uint64_t* const settled_sa = ctx.sifting_with->SAptr;
    ctx.RAX = get_el_from_selarr(intruder_sa, element);
    ctx.RBX = get_el_from_selarr(settled_sa, element);
}
|
||||
|
||||
uint64_t get_selarr_el_dist(uint64_t* sa, uint16_t start, uint16_t end){
|
||||
uint64_t v_start = get_el_from_selarr(sa, start);
|
||||
uint64_t v_end = get_el_from_selarr(sa, end);
|
||||
return v_end > v_start ? v_end - v_start : 0;
|
||||
}
|
||||
|
||||
/* DDIST_RABX_SELARR <start index> <end index> (sift mode only): puts the distance between the
 * two selection array elements of the active thread into RAX and the same distance of the
 * thread it is sifted with into RBX. Each index is range-checked right after it is read. */
void i_DDIST_RABX_SELARR(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    sift_mode_check()
    check_available_prg(REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ * 2)
    const regex_tai_t first = ctx.extract_track_array_index();
    if (first >= ctx.selection_array_len)
        smitsya(selection_arr_out_of_range);
    const regex_tai_t last = ctx.extract_track_array_index();
    if (last >= ctx.selection_array_len)
        smitsya(selection_arr_out_of_range);
    uint64_t* const intruder_sa = ctx.active_thread.SAptr;
    uint64_t* const settled_sa = ctx.sifting_with->SAptr;
    ctx.RAX = get_selarr_el_dist(intruder_sa, first, last);
    ctx.RBX = get_selarr_el_dist(settled_sa, first, last);
}
|
||||
|
||||
/* Ends a conflict in favour of the thread that already sits in the slot.
 * Cloning conflict (FORK, or MATCH which is recorded as FORK): the active thread just goes on
 * from the IP prepared for it. READ conflict: the active thread dies and the next scheduled one
 * continues. Sift mode is left in both cases. */
void finish_conflict_homesteader_wins(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    if (ctx.who_started_sift != regex024_opcodes::READ){
        ctx.active_thread.IP = ctx.child_ret_IP;
    } else {
        ctx.active_thread.delete_thread();
        ctx.try_to_continue_scheduled();
    }
    ctx.sifting_with = NULL;
}
|
||||
|
||||
/* Ends a conflict in favour of the active thread (the intruder).
 * The thread in the contested slot is destroyed and the intruder gets back the IP it had when
 * the conflict began. Cloning conflict: a clone of the intruder takes the slot and the intruder
 * continues from child_ret_IP. READ conflict: the intruder itself moves into the slot and the
 * next scheduled thread continues. Sift mode is left in both cases. */
void finish_conflict_intruder_wins(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    REGEX_IS024_Thread& contested_slot = *ctx.sifting_with;
    contested_slot.delete_thread();
    ctx.active_thread.IP = ctx.intruder_IP;
    if (ctx.who_started_sift != regex024_opcodes::READ){
        clone_thread_into_slot(ctx.active_thread, contested_slot);
        ctx.active_thread.IP = ctx.child_ret_IP;
    } else {
        /* NOTE(review): the slot takes over the occupation status of the active thread as is;
         * confirm this is the status intended for a thread that has just settled */
        contested_slot = ctx.active_thread;
        ctx.active_thread.slot_occupation_status = SLOT_EMPTY_val;
        ctx.try_to_continue_scheduled();
    }
    ctx.sifting_with = NULL;
}
|
||||
|
||||
/* SIFTPRIOR_MIN_RABX (sift mode only): the thread with the smaller register value wins
 * (RAX belongs to the intruder, RBX to the settled thread). On a tie nothing happens and the
 * sift function keeps running. */
void i_SIFTPRIOR_MIN_RABX(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    sift_mode_check()
    if (ctx.RAX == ctx.RBX)
        return;
    if (ctx.RAX < ctx.RBX)
        finish_conflict_intruder_wins(ctx);
    else
        finish_conflict_homesteader_wins(ctx);
}
|
||||
|
||||
/* SIFTPRIOR_MAX_RABX (sift mode only): the thread with the larger register value wins
 * (RAX belongs to the intruder, RBX to the settled thread). On a tie nothing happens and the
 * sift function keeps running. */
void i_SIFTPRIOR_MAX_RABX(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    sift_mode_check()
    if (ctx.RAX == ctx.RBX)
        return;
    if (ctx.RAX > ctx.RBX)
        finish_conflict_intruder_wins(ctx);
    else
        finish_conflict_homesteader_wins(ctx);
}
|
||||
|
||||
/* SIFT_DONE (sift mode only): end of the sift function without a decision,
 * so the thread that already occupies the slot keeps it */
void i_SIFT_DONE(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    sift_mode_check()
    finish_conflict_homesteader_wins(ctx);
}
|
||||
|
||||
/* Can give errors */
|
||||
void ca_branch_new_node(REGEX_IS024_CONTEXT& ctx, regex_tai_t key, uint64_t val){
|
||||
ctx_print_debug(ctx);
|
||||
if (ctx.CAN_total >= ctx.CA_TREE_LIMIT)
|
||||
smitsya(ca_tree_limit_violation);
|
||||
REGEX024_CollectionArrayNode* node = new REGEX024_CollectionArrayNode{key, val, ctx.active_thread.CAHptr, 1};
|
||||
// if (ctx.active_thread.CAHptr)
|
||||
// (ctx.active_thread.CAHptr->refs)++;
|
||||
ctx.active_thread.CAHptr = node;
|
||||
ctx.CAN_total++;
|
||||
}
|
||||
|
||||
/* MOV_COLARR_IMM <index> <quad word>: records the immediate value under the given
 * collection array index */
void i_MOV_COLARR_IMM(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    general_matching_mode_check()
    check_available_prg(REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ + 8)
    const regex_tai_t index = ctx.extract_track_array_index();
    const uint64_t immediate = ctx.extract_qw();
    ca_branch_new_node(ctx, index, immediate);
}
|
||||
|
||||
/* MOV_COLARR_BTPOS <index>: records ctx.passed_bytes under the given collection array index */
void i_MOV_COLARR_BTPOS(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    general_matching_mode_check()
    check_available_prg(REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ)
    const regex_tai_t index = ctx.extract_track_array_index();
    ca_branch_new_node(ctx, index, ctx.passed_bytes);
}
|
||||
|
||||
/* Can throw error, should be placed at the end. Call ONLY in general matching mode */
|
||||
void edit_selection_array(REGEX_IS024_CONTEXT& ctx, uint64_t key, uint64_t val){
|
||||
ctx_print_debug(ctx);
|
||||
uint64_t N = ctx.selection_array_len;
|
||||
if (key >= N)
|
||||
smitsya(selection_arr_out_of_range);
|
||||
if (!ctx.active_thread.SAptr){
|
||||
uint64_t* sa_instance = (uint64_t*)calloc(N + 1, 8);
|
||||
if (!sa_instance)
|
||||
throw std::bad_alloc();
|
||||
sa_instance[0] = 1;
|
||||
sa_instance[key + 1] = val;
|
||||
ctx.active_thread.SAptr = sa_instance;
|
||||
} else if (ctx.active_thread.SAptr[0] == 1){
|
||||
ctx.active_thread.SAptr[key + 1] = val;
|
||||
} else {
|
||||
uint64_t* sa_instance = (uint64_t*)calloc(N + 1, 8);
|
||||
if (!sa_instance)
|
||||
throw std::bad_alloc();
|
||||
sa_instance[0] = 1;
|
||||
for (uint64_t i = 1; i <= ctx.selection_array_len; i++)
|
||||
sa_instance[i] = ctx.active_thread.SAptr[i];
|
||||
sa_instance[key + 1] = val;
|
||||
ctx.active_thread.SAptr[0]--;
|
||||
ctx.active_thread.SAptr = sa_instance;
|
||||
}
|
||||
}
|
||||
|
||||
/* MOV_SELARR_IMM <index> <quad word>: stores the immediate value into the selection array */
void i_MOV_SELARR_IMM(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    general_matching_mode_check()
    check_available_prg(REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ + 8)
    const regex_tai_t index = ctx.extract_track_array_index();
    const uint64_t immediate = ctx.extract_qw();
    edit_selection_array(ctx, index, immediate);
}
|
||||
|
||||
/* MOV_SELARR_CHPOS <index>: stores ctx.passed_chars into the selection array */
void i_MOV_SELARR_CHPOS(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    general_matching_mode_check()
    check_available_prg(REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ)
    const regex_tai_t index = ctx.extract_track_array_index();
    edit_selection_array(ctx, index, ctx.passed_chars);
}
|
||||
|
||||
/* Gives an empty, storage-less stack zero-initialized room for `nmemb` slot ids.
 * Throws std::bad_alloc when the allocation fails.
 * A request for zero elements is legal and common (for instance, a program that does not use
 * the second portion of READ slots). calloc(0, ...) is allowed to return NULL, which must not be
 * mistaken for running out of memory, so at least one element is always requested. */
void calloc_stack_slots(REGEX_IS024_Stack& stack, regex_sslot_id_t nmemb) {
    assert(stack.sz == 0 && !stack.slots);
    const size_t allocated_count = (nmemb == 0) ? 1 : nmemb;
    regex_sslot_id_t* storage = static_cast<regex_sslot_id_t *>(calloc(allocated_count, sizeof(regex_sslot_id_t)));
    if (!storage)
        throw std::bad_alloc();
    stack.slots = storage;
}
|
||||
|
||||
/* Allocates a zero-filled array of `nmemb` thread slots and returns it.
 * Throws std::bad_alloc when the allocation fails.
 * Zero slots is a legal request. calloc(0, ...) is allowed to return NULL, which must not be
 * mistaken for running out of memory, so at least one element is always requested and the
 * result is never NULL. */
REGEX_IS024_Thread* calloc_slots_array(regex_sslot_id_t nmemb) {
    const size_t allocated_count = (nmemb == 0) ? 1 : nmemb;
    REGEX_IS024_Thread* ptr = static_cast<REGEX_IS024_Thread *>(calloc(allocated_count, sizeof(REGEX_IS024_Thread)));
    if (!ptr)
        throw std::bad_alloc();
    return ptr;
}
|
||||
|
||||
/* INIT: ends the configuration phase.
 * Validates the parameters chosen by the program against the limits of the machine, allocates
 * all slot arrays and stacks, remembers the current IP as the entry point for threads started
 * later and destroys the thread that executed the initialization code.
 * Errors: sa_length_limit_violation, read_sslot_count_limit_violation,
 * fork_sslot_count_limit_violation, read_sslot_out_of_range (the second portion of READ slots
 * is larger than the total number of READ slots). May throw std::bad_alloc. */
void i_INIT(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    initialization_phase_check()
    if (ctx.selection_array_len > ctx.SA_LEN_LIMIT)
        smitsya(sa_length_limit_violation);
    if (ctx.read_slots_number > ctx.READ_SS_LIMIT)
        smitsya(read_sslot_count_limit_violation);
    if (ctx.fork_slots_number > ctx.FORK_SS_LIMIT)
        smitsya(fork_sslot_count_limit_violation);
    /* This is a partition of READ slots, so the READ flavour of the error is reported.
     * The check also protects the subtraction below from wrapping around. */
    if (ctx.portion_of_second_read_halt_ns > ctx.read_slots_number)
        smitsya(read_sslot_out_of_range);

    ctx.READ_halted_slots = calloc_slots_array(ctx.read_slots_number);
    calloc_stack_slots(ctx.READ_halted_stack_old, ctx.read_slots_number);

    ctx.portion_of_FIRST_read_halt_ns = ctx.read_slots_number - ctx.portion_of_second_read_halt_ns;
    calloc_stack_slots(ctx.READ_halted_stack_new_first, ctx.portion_of_FIRST_read_halt_ns);
    calloc_stack_slots(ctx.READ_halted_stack_new_second, ctx.portion_of_second_read_halt_ns);

    ctx.FORK_halted_slots = calloc_slots_array(ctx.fork_slots_number);
    calloc_stack_slots(ctx.FORK_halted_stack, ctx.fork_slots_number);

    ctx.initialized = true;
    ctx.unnatural_started_thread_IP = ctx.active_thread.IP;
    ctx.active_thread.delete_thread();
}
|
||||
|
||||
/* THROW: the program itself requests a failure; allowed in any phase */
void i_THROW(REGEX_IS024_CONTEXT& ctx){
    ctx.error = regex024_error_codes::program_throw;
}
|
||||
|
||||
/* Reads the opcode at the current position and runs the handler of that instruction.
 * An unknown opcode results in the invalid_opcode error. */
void instruction_table(REGEX_IS024_CONTEXT &ctx) {
    ctx_print_debug(ctx);
    uint8_t opcode = ctx.extract_instruction();

/* Opcode NAME is served by the function i_NAME */
#define DISPATCH(inst) case regex024_opcodes::inst: i_ ## inst (ctx); return;
/* Every conditional jump exists in four immediate widths */
#define DISPATCH_JC(UN, st) \
    case regex024_opcodes::JC ## UN ## _B: i_JC<st, immArgByte>(ctx); return; \
    case regex024_opcodes::JC ## UN ## _W: i_JC<st, immArgWord>(ctx); return; \
    case regex024_opcodes::JC ## UN ## _DW: i_JC<st, immArgDoubleWord>(ctx); return; \
    case regex024_opcodes::JC ## UN ## _QW: i_JC<st, immArgQuadWord>(ctx); return;

    switch (opcode) {
        DISPATCH(READ)
        DISPATCH(READZ)
        DISPATCH(JUMP)

        DISPATCH_JC(EQUAL, condEqual)
        DISPATCH_JC(LESS, condLess)
        DISPATCH_JC(GRTR, condGrtr)

        DISPATCH(FORK)
        DISPATCH(MATCH)
        DISPATCH(DIE)
        DISPATCH(PARAM_READ_SS_NUMBER)
        DISPATCH(PARAM_FORK_SS_NUMBER)
        DISPATCH(PARAM_SELARR_LEN)
        DISPATCH(PARAM_COLSIFTFUNC_SET)
        DISPATCH(PARAM_COLSIFTFUNC_WIPE)
        DISPATCH(MSG_MULTISTART_ALLOWED)
        DISPATCH(MSG_FED_INPUT_EXTENDED)
        DISPATCH(DMOV_RABX_SELARR)
        DISPATCH(DDIST_RABX_SELARR)
        DISPATCH(SIFTPRIOR_MIN_RABX)
        DISPATCH(SIFTPRIOR_MAX_RABX)
        DISPATCH(SIFT_DONE)
        DISPATCH(MOV_COLARR_IMM)
        DISPATCH(MOV_COLARR_BTPOS)
        DISPATCH(MOV_SELARR_IMM)
        DISPATCH(MOV_SELARR_CHPOS)
        DISPATCH(INIT)
        DISPATCH(THROW)
        default:
            ctx.error = regex024_error_codes::invalid_opcode;
    }

#undef DISPATCH_JC
#undef DISPATCH
}
|
35
src/libregexis024vm/instruction_implementation.h
Normal file
35
src/libregexis024vm/instruction_implementation.h
Normal file
@ -0,0 +1,35 @@
|
||||
#ifndef LIBREGEXIS024_INSTRUCTION_IMPLEMENTATION_H
|
||||
#define LIBREGEXIS024_INSTRUCTION_IMPLEMENTATION_H
|
||||
|
||||
/* This file should not be included outside libregex024 virtual machine implementation */
|
||||
|
||||
#include <libregexis024vm/libregexis024vm.h>
|
||||
#include <libregexis024vm/vm_opcodes.h>
|
||||
#include <assert.h>
|
||||
|
||||
/* Stores the given error code in `ctx` and returns from the enclosing void function */
#define smitsya(error_type) do {ctx.error = regex024_error_codes::error_type; return; } while (0)

/* slot_occupation_status of a thread: SLOT_OCCUPIED and SLOT_NEW are bit flags,
 * the *_val names are the complete values that get assigned or compared */
#define SLOT_EMPTY_val 0
#define SLOT_OCCUPIED 1
#define SLOT_OCCUPIED_val SLOT_OCCUPIED
#define SLOT_NEW 2
#define SLOT_NEW_val (SLOT_OCCUPIED | SLOT_NEW)

/* Leaves the enclosing void function with improper_finish when ctx.check_inboundness
 * rejects a region of regionSz bytes. Expands to a plain `if`. */
#define check_available_prg(regionSz) \
    if (!ctx.check_inboundness(regionSz)){ ctx.error = regex024_error_codes::improper_finish; return; }
|
||||
|
||||
|
||||
/* my_assert is the plain assert in every configuration */
#define my_assert(expr) assert(expr)

/* Tracing helpers: print the context / a thread in loud debug builds, expand to nothing otherwise */
#if defined(LIBREGEXIS024_DEBUG) && defined(LIBREGEXIS024_ALLOW_LOUD)
#include <debugging_regexis024/vm/libregexis024vm_debug.h>
#define ctx_print_debug(ctx) debug_print_context(ctx, __func__)
#define thread_print_debug(thread) debug_print_thread(thread, __func__)
#else
#define ctx_print_debug(ctx)
#define thread_print_debug(thread)
#endif
|
||||
|
||||
/* Decodes the opcode at the current position of `ctx` and executes that one instruction;
 * failures are reported through ctx.error */
void instruction_table(REGEX_IS024_CONTEXT &ctx);
|
||||
|
||||
#endif //LIBREGEXIS024_INSTRUCTION_IMPLEMENTATION_H
|
47
src/libregexis024vm/libregex024opcodes_stringification.cpp
Normal file
47
src/libregexis024vm/libregex024opcodes_stringification.cpp
Normal file
@ -0,0 +1,47 @@
|
||||
#include <libregexis024vm/vm_opcodes.h>
|
||||
#include <libregexis024vm/utils.h>
|
||||
|
||||
#define rcase(name) case regex024_opcodes::name: return #name;
|
||||
|
||||
const char *regex024_opcode_tostr(regex024_opcode x) {
|
||||
switch (x) {
|
||||
rcase(READ)
|
||||
rcase(READZ)
|
||||
rcase(JUMP)
|
||||
rcase(JCEQUAL_B)
|
||||
rcase(JCEQUAL_W)
|
||||
rcase(JCEQUAL_DW)
|
||||
rcase(JCEQUAL_QW)
|
||||
rcase(JCLESS_B)
|
||||
rcase(JCLESS_W)
|
||||
rcase(JCLESS_DW)
|
||||
rcase(JCLESS_QW)
|
||||
rcase(JCGRTR_B)
|
||||
rcase(JCGRTR_W)
|
||||
rcase(JCGRTR_DW)
|
||||
rcase(JCGRTR_QW)
|
||||
rcase(FORK)
|
||||
rcase(MATCH)
|
||||
rcase(DIE)
|
||||
rcase(PARAM_READ_SS_NUMBER)
|
||||
rcase(PARAM_FORK_SS_NUMBER)
|
||||
rcase(PARAM_SELARR_LEN)
|
||||
rcase(PARAM_COLSIFTFUNC_SET)
|
||||
rcase(PARAM_COLSIFTFUNC_WIPE)
|
||||
rcase(MSG_MULTISTART_ALLOWED)
|
||||
rcase(MSG_FED_INPUT_EXTENDED)
|
||||
rcase(DMOV_RABX_SELARR)
|
||||
rcase(DDIST_RABX_SELARR)
|
||||
rcase(SIFTPRIOR_MIN_RABX)
|
||||
rcase(SIFTPRIOR_MAX_RABX)
|
||||
rcase(SIFT_DONE)
|
||||
rcase(MOV_COLARR_IMM)
|
||||
rcase(MOV_COLARR_BTPOS)
|
||||
rcase(MOV_SELARR_IMM)
|
||||
rcase(MOV_SELARR_CHPOS)
|
||||
rcase(INIT)
|
||||
rcase(THROW)
|
||||
default:
|
||||
return "Invalid opcode";
|
||||
}
|
||||
}
|
158
src/libregexis024vm/libregexis024vm.h
Normal file
158
src/libregexis024vm/libregexis024vm.h
Normal file
@ -0,0 +1,158 @@
|
||||
#ifndef LIBREGEXIS024_LIBREGEXIS024VM_H
|
||||
#define LIBREGEXIS024_LIBREGEXIS024VM_H
|
||||
|
||||
/* This thing is bloated. And slow (Because I designed it imperfectly and because it is bloated).
 * I could have halved the amount of bloat, but that would require me writing code in headers.
 * I am gonna use it for KM, even more bloated project. So I thought that this design is on the spot.
 * C++ is such a funny language. Code is divided into .cpp and .h files. But it only makes problems.
 * All of my work on this C++ project was not serious from the beginning. It's all funny stuff. */
|
||||
|
||||
/* Also, please, consider using libregexis024vm/libregexis024vm_interface.h
|
||||
* Naming in this project is super inconsistent. I don't want it to trash your namespace */
|
||||
|
||||
#include <libregexis024vm/vm_errno.h>
|
||||
#include <libregexis024vm/utils.h>
|
||||
#include <libregexis024vm/vm_opcodes_types.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
|
||||
struct REGEX_IS024_Stack{
|
||||
regex_sslot_id_t* slots = NULL;
|
||||
regex_sslot_id_t sz = 0;
|
||||
|
||||
regex_sslot_id_t pop();
|
||||
void append(regex_sslot_id_t x);
|
||||
bool empty() const;
|
||||
bool non_empty() const;
|
||||
|
||||
REGEX_IS024_Stack(const REGEX_IS024_Stack&) = delete;
|
||||
REGEX_IS024_Stack& operator=(const REGEX_IS024_Stack&) = delete;
|
||||
REGEX_IS024_Stack() = default;
|
||||
|
||||
~REGEX_IS024_Stack();
|
||||
};
|
||||
|
||||
struct REGEX024_CollectionArrayNode{
|
||||
/* Key is small for historical reasons I do not rememeber. Who cares anyway */
|
||||
regex_tai_t key;
|
||||
uint64_t value;
|
||||
/* NULL at the beginning */
|
||||
REGEX024_CollectionArrayNode* prev;
|
||||
/* Reference counting */
|
||||
uint64_t refs = 0;
|
||||
};
|
||||
|
||||
struct REGEX_IS024_Thread{
|
||||
/* First byte field is used only when thread is located in slot */
|
||||
uint8_t slot_occupation_status = 0;
|
||||
regex_near_ptr_t IP = 0;
|
||||
REGEX024_CollectionArrayNode* CAHptr = NULL;
|
||||
/* Pointer to the seletion array. SA's are reference counted. Because of that every SA
|
||||
* is elongated by one meta element in the beginning - reference counter. So the actual elements
|
||||
* are enumerated starting from one. */
|
||||
uint64_t* SAptr = NULL;
|
||||
|
||||
void delete_thread() noexcept;
|
||||
void debug_print(const char* place);
|
||||
};
|
||||
|
||||
struct REGEX_IS024_CONTEXT{
|
||||
REGEX_IS024_CONTEXT(size_t programSize, const uint8_t *data, uint64_t caTreeLimit, regex_tai_t saLenLimit,
|
||||
regex_sslot_id_t readSsLimit, regex_sslot_id_t forkSsLimit, uint64_t timeTickLimit);
|
||||
|
||||
regex024_error_code feedSOF();
|
||||
/* You can safely pile up calls to this command, nothing bad will happen */
|
||||
regex024_error_code startThread();
|
||||
regex024_error_code extendedFeedCharacter(uint64_t input);
|
||||
regex024_error_code feedCharacter(uint64_t INP, uint64_t corresponding_byte_amount);
|
||||
|
||||
|
||||
~REGEX_IS024_CONTEXT();
|
||||
|
||||
/* Program size larger than 2^62 is forbidden */
|
||||
size_t program_size = 0;
|
||||
const uint8_t* prg = NULL;
|
||||
|
||||
/* Max allowed index of CA is 2^16 - 1
|
||||
* Max allowed index of SA is 2^16 - 1. VM can be configured to allow even less */
|
||||
/* CA = Collecton array. */
|
||||
uint64_t CA_TREE_LIMIT;
|
||||
/* SA = Selection array */
|
||||
regex_tai_t SA_LEN_LIMIT;
|
||||
regex_sslot_id_t READ_SS_LIMIT;
|
||||
regex_sslot_id_t FORK_SS_LIMIT;
|
||||
|
||||
/* If time_tick_limit is non-zero, regex virtual machine will stop with error
|
||||
* after this many ticks. This parameter set's the timeout.*/
|
||||
uint64_t time_tick_limit;
|
||||
|
||||
/* This context is used only for one FA match session. This field measures each tick
|
||||
* timer <= time_tick_limit */
|
||||
uint64_t timer = 0;
|
||||
/* CAN_total <= CA_TREE_LIMIT */
|
||||
uint64_t CAN_total = 0;
|
||||
|
||||
/* Program selects it */
|
||||
regex_tai_t selection_array_len = 0;
|
||||
regex_sslot_id_t read_slots_number = 0;
|
||||
regex_sslot_id_t fork_slots_number = 0;
|
||||
|
||||
bool have_sift_function = false;
|
||||
regex_near_ptr_t sift_function;
|
||||
|
||||
bool allows_multistart = false;
|
||||
uint8_t fed_input_extends_left = 0, fed_input_extends_right = 0;
|
||||
regex_sslot_id_t portion_of_second_read_halt_ns = 0, portion_of_FIRST_read_halt_ns = 0;
|
||||
|
||||
bool initialized = false;
|
||||
regex_near_ptr_t unnatural_started_thread_IP = 1337;
|
||||
regex024_error_code error = regex024_error_codes::stable;
|
||||
|
||||
REGEX_IS024_Thread* READ_halted_slots;
|
||||
REGEX_IS024_Stack READ_halted_stack_old;
|
||||
REGEX_IS024_Stack READ_halted_stack_new_first;
|
||||
REGEX_IS024_Stack READ_halted_stack_new_second;
|
||||
REGEX_IS024_Thread* FORK_halted_slots;
|
||||
REGEX_IS024_Stack FORK_halted_stack;
|
||||
|
||||
REGEX_IS024_Thread active_thread;
|
||||
|
||||
/* Environment for sifting stuff */
|
||||
REGEX_IS024_Thread* sifting_with = NULL;
|
||||
/* specifies the type of operation vm should do after shift (there are only two distinct options) */
|
||||
uint8_t who_started_sift;
|
||||
/* Sifting process uses IP field of active thread. Other data of thread is not modified or used during collision
|
||||
* procudure. Old IP is stored there, if needed */
|
||||
regex_near_ptr_t child_ret_IP;
|
||||
regex_near_ptr_t intruder_IP;
|
||||
/* RAX corresponds to intruder. Its data is stored in active thread field*/
|
||||
uint64_t RAX;
|
||||
/* RBX corresponds to homesteader. Its data is accessible by `REGEX_IS024_Thread* sifting_with` pointer*/
|
||||
uint64_t RBX;
|
||||
|
||||
/* Will be unoccupied if no threads matched. After each feed of character this field will be wiped
|
||||
* User should take care of intermediate success himself */
|
||||
REGEX_IS024_Thread matched_thread;
|
||||
|
||||
uint64_t INP = 0;
|
||||
uint64_t passed_chars = 0;
|
||||
uint64_t passed_bytes = 0;
|
||||
|
||||
void try_to_continue_scheduled();
|
||||
|
||||
bool check_inboundness(int region);
|
||||
|
||||
uint8_t extract_b();
|
||||
uint16_t extract_w();
|
||||
uint32_t extract_dw();
|
||||
uint64_t extract_qw();
|
||||
|
||||
uint8_t extract_instruction();
|
||||
regex_sslot_id_t extract_sslot_id();
|
||||
regex_near_ptr_t extract_near_pointer();
|
||||
regex_tai_t extract_track_array_index();
|
||||
|
||||
void debug_print(const char* place);
|
||||
};
|
||||
|
||||
#endif //LIBREGEXIS024_LIBREGEXIS024VM_H
|
197
src/libregexis024vm/libregexis024vm_context.cpp
Normal file
197
src/libregexis024vm/libregexis024vm_context.cpp
Normal file
@ -0,0 +1,197 @@
|
||||
#include <libregexis024vm/libregexis024vm.h>
|
||||
#include <libregexis024vm/instruction_implementation.h>
|
||||
#include <utility>
|
||||
|
||||
/* Removes the most recently appended id and returns it. The stack must not be empty. */
regex_sslot_id_t REGEX_IS024_Stack::pop() {
    assert(sz != 0);
    sz -= 1;
    return slots[sz];
}
|
||||
|
||||
/* Stores x on top of the stack. `slots` must already point to storage; the capacity of that
 * storage is not known here and is not checked. */
void REGEX_IS024_Stack::append(regex_sslot_id_t x) {
    assert(slots);
    slots[sz++] = x;
}
|
||||
|
||||
bool REGEX_IS024_Stack::empty() const {
|
||||
return !non_empty();
|
||||
}
|
||||
|
||||
bool REGEX_IS024_Stack::non_empty() const {
|
||||
return sz;
|
||||
}
|
||||
|
||||
/* The owner is expected to have popped everything before destruction; the storage is then
 * handed back with free() (freeing NULL is harmless). */
REGEX_IS024_Stack::~REGEX_IS024_Stack() {
    assert(empty());
    ::free(slots);
}
|
||||
|
||||
/* Remembers the program and the configured limits, and marks the active thread as occupied.
 *
 * programSize   - size of the bytecode in bytes; more than 2^62 terminates the process via exitf()
 * data          - the bytecode itself (not copied)
 * caTreeLimit, saLenLimit, readSsLimit, forkSsLimit, timeTickLimit - stored as the matching limits */
REGEX_IS024_CONTEXT::REGEX_IS024_CONTEXT(size_t programSize, const uint8_t *data,
                                         uint64_t caTreeLimit, regex_tai_t saLenLimit,
                                         regex_sslot_id_t readSsLimit, regex_sslot_id_t forkSsLimit,
                                         uint64_t timeTickLimit) :
        program_size(programSize), prg(data), CA_TREE_LIMIT(caTreeLimit), SA_LEN_LIMIT(saLenLimit),
        READ_SS_LIMIT(readSsLimit), FORK_SS_LIMIT(forkSsLimit), time_tick_limit(timeTickLimit)
{
    /* The constant is built from a 64-bit one: `1UL << 62` shifts past the width of
     * `unsigned long` on platforms where that type has only 32 bits. */
    const uint64_t max_program_size = static_cast<uint64_t>(1) << 62;
    if (program_size > max_program_size)
        exitf("Program is too huge\n");
    active_thread.slot_occupation_status = SLOT_OCCUPIED;
}
|
||||
|
||||
/* No only will it launch a wave of deallocation in CA tree, but as a nice bonus it's
|
||||
* gonna deoccupy slot_occupation_status*/
|
||||
void REGEX_IS024_Thread::delete_thread() noexcept {
|
||||
thread_print_debug(*this);
|
||||
my_assert(slot_occupation_status & SLOT_OCCUPIED);
|
||||
slot_occupation_status = SLOT_EMPTY_val;
|
||||
REGEX024_CollectionArrayNode* cur_CAptr = CAHptr;
|
||||
while (cur_CAptr){
|
||||
assert(cur_CAptr->refs > 0);
|
||||
if (--(cur_CAptr->refs) == 0){
|
||||
REGEX024_CollectionArrayNode* next_CAptr = cur_CAptr->prev;
|
||||
delete cur_CAptr;
|
||||
cur_CAptr = next_CAptr;
|
||||
} else
|
||||
break;
|
||||
}
|
||||
if (SAptr){
|
||||
if (--(SAptr[0]) == 0)
|
||||
free(SAptr);
|
||||
}
|
||||
}
|
||||
|
||||
/* Pops every slot id from the given stack and deletes the thread that sits in the
 * corresponding READ slot. */
void emptify_one_of_new_read_halted_stacks(REGEX_IS024_CONTEXT& ctx, REGEX_IS024_Stack& type_new_stack){
    while (!type_new_stack.empty()){
        const regex_sslot_id_t slot_id = type_new_stack.pop();
        REGEX_IS024_Thread& thread = ctx.READ_halted_slots[slot_id];
        assert(thread.slot_occupation_status & SLOT_OCCUPIED);
        thread.delete_thread();
    }
}
|
||||
|
||||
/* First it will try to pop pending thread from FORK_halted_stack
|
||||
* Then it will try popping thread from READ_halted_stack_old (checking if top
|
||||
* thread here is not actually SLOT_NEW). If something succeded, corresponding slot will be deoccupied, and
|
||||
* active slot will be occupied with it.
|
||||
*
|
||||
* try_to_continue_scheduled() assumes that active thread is unoccupied.*/
|
||||
void REGEX_IS024_CONTEXT::try_to_continue_scheduled(){
|
||||
ctx_print_debug(*this);
|
||||
my_assert(!(active_thread.slot_occupation_status & SLOT_OCCUPIED));
|
||||
if (FORK_halted_stack.sz){
|
||||
regex_sslot_id_t ssid = FORK_halted_stack.pop();
|
||||
active_thread = FORK_halted_slots[ssid];
|
||||
FORK_halted_slots[ssid].slot_occupation_status = SLOT_EMPTY_val;
|
||||
return;
|
||||
}
|
||||
while (READ_halted_stack_old.sz){
|
||||
regex_sslot_id_t ssid = READ_halted_stack_old.pop();
|
||||
if (READ_halted_slots[ssid].slot_occupation_status & SLOT_NEW){
|
||||
/* This is the case when old thread was silently replaced by settled new thread */
|
||||
continue;
|
||||
}
|
||||
active_thread = READ_halted_slots[ssid];
|
||||
READ_halted_slots[ssid].slot_occupation_status = SLOT_EMPTY_val;
|
||||
return;
|
||||
}
|
||||
/* Failure here will be detected. We started with unoccupied active thread. iterator inside kick will see it */
|
||||
}
|
||||
|
||||
/* Runs instructions of the active thread for as long as it stays occupied and no error is
 * recorded in ctx.error. Every executed instruction costs one tick.
 *
 * The tick limit is enforced only when ctx.time_tick_limit is non-zero, as the documentation of
 * that field states; before, a limit of zero reported a timeout before the very first instruction.
 *
 * NOTE(review): smitsya() and check_available_prg() are macros defined outside this file
 * section; per the inline note they may return from kick() - confirm against their definitions. */
void kick(REGEX_IS024_CONTEXT& ctx) {
    ctx_print_debug(ctx);
    while ((ctx.active_thread.slot_occupation_status & SLOT_OCCUPIED)
           && ctx.error == regex024_error_codes::stable){
        if (ctx.time_tick_limit != 0 && ctx.timer >= ctx.time_tick_limit)
            smitsya(timeout);
        ctx.timer++;

        check_available_prg(REGEX024_BYTECODE_INSTRUCTION_SZ) // May return from kick(ctx)
        // An error raised by the instruction is noticed by the loop condition right away
        instruction_table(ctx);
    }
}
|
||||
|
||||
|
||||
/* Runs the VM with the active thread in whatever state it currently is (the constructor
 * leaves it occupied with IP 0) and reports the resulting error state. */
regex024_error_code REGEX_IS024_CONTEXT::feedSOF() {
    ctx_print_debug(*this);
    kick(*this);
    return this->error;
}
|
||||
|
||||
/* Turns the active thread into a fresh one (no CA chain, no SA) positioned at
 * unnatural_started_thread_IP, runs it and reports the resulting error state. */
regex024_error_code REGEX_IS024_CONTEXT::startThread() {
    ctx_print_debug(*this);
    REGEX_IS024_Thread& fresh = active_thread;
    fresh.CAHptr = NULL;
    fresh.SAptr = NULL;
    fresh.IP = unnatural_started_thread_IP;
    fresh.slot_occupation_status = SLOT_OCCUPIED;
    kick(*this);
    return this->error;
}
|
||||
|
||||
/* I hate C++ (aka antichrist), won't use move sementic (aka drink cornsyrup) */
|
||||
void swap_stacks(REGEX_IS024_Stack& A, REGEX_IS024_Stack& B) {
|
||||
std::swap(A.sz, B.sz);
|
||||
std::swap(A.slots, B.slots);
|
||||
}
|
||||
|
||||
/* Makes the given "new" stack the current "old" one (the old one must be empty, so after the
 * swap the new one is empty) and strips every extra flag from the slots it lists, leaving
 * them plainly occupied. */
void fill_empty_old_read_halted_stack(REGEX_IS024_CONTEXT& ctx, REGEX_IS024_Stack& read_halted_stack_new){
    ctx_print_debug(ctx);
    my_assert(!ctx.READ_halted_stack_old.non_empty());

    // Actually, READ_halted_stack_old is always empty in this case
    assert(ctx.READ_halted_stack_old.empty());
    swap_stacks(ctx.READ_halted_stack_old, read_halted_stack_new);

    uint32_t position = 0;
    while (position < ctx.READ_halted_stack_old.sz){
        const regex_sslot_id_t slot_id = ctx.READ_halted_stack_old.slots[position];
        REGEX_IS024_Thread& slot = ctx.READ_halted_slots[slot_id];
        /* Should get rid of 'NEW' qualifier */
        assert(slot.slot_occupation_status & SLOT_OCCUPIED);
        if (slot.slot_occupation_status & SLOT_OCCUPIED){
            slot.slot_occupation_status = SLOT_OCCUPIED;
        }
        position++;
    }
}
|
||||
|
||||
/* Feeds one character of the actual input.
 * Wipes the previous match, deletes the threads listed in the second "new" READ stack, makes
 * the first "new" READ stack the "old" one, advances the position counters and then resumes
 * the scheduled threads.
 *
 * input                     - the character to feed
 * corresponding_byte_amount - how many bytes the character took; added to passed_bytes
 * Returns the error state of the context after running. */
regex024_error_code REGEX_IS024_CONTEXT::feedCharacter(uint64_t input, uint64_t corresponding_byte_amount) {
    ctx_print_debug(*this);
    if (matched_thread.slot_occupation_status & SLOT_OCCUPIED){
        matched_thread.delete_thread();
    }
    emptify_one_of_new_read_halted_stacks(*this, READ_halted_stack_new_second);
    fill_empty_old_read_halted_stack(*this, READ_halted_stack_new_first);

    INP = input;
    passed_bytes = passed_bytes + corresponding_byte_amount;
    passed_chars += 1;

    try_to_continue_scheduled();
    kick(*this);
    return this->error;
}
|
||||
|
||||
/* Feeds one character without touching the position counters.
 * Wipes the previous match, makes the second "new" READ stack the "old" one and resumes the
 * scheduled threads. Returns the error state of the context after running. */
regex024_error_code REGEX_IS024_CONTEXT::extendedFeedCharacter(uint64_t input) {
    ctx_print_debug(*this);
    if (matched_thread.slot_occupation_status & SLOT_OCCUPIED){
        matched_thread.delete_thread();
    }
    fill_empty_old_read_halted_stack(*this, READ_halted_stack_new_second);

    INP = input;

    try_to_continue_scheduled();
    kick(*this);
    return this->error;
}
|
||||
|
||||
/* Releases every thread still parked in a slot, the slot arrays and the matched thread.
 * Nothing is released unless `initialized` is set.
 *
 * The "old" READ stack is drained BEFORE the "new" ones. It may still list a slot whose thread
 * was replaced by a new one (flag SLOT_NEW); such an entry has to be skipped here and left to
 * the "new" stack that lists it. With the previous order the new stacks had already emptied
 * those slots, so the SLOT_NEW test could no longer recognise them. */
REGEX_IS024_CONTEXT::~REGEX_IS024_CONTEXT() {
    ctx_print_debug(*this);
    if (!initialized)
        return;

    while (READ_halted_stack_old.non_empty()){
        REGEX_IS024_Thread& thread = READ_halted_slots[READ_halted_stack_old.pop()];
        assert(thread.slot_occupation_status & SLOT_OCCUPIED);
        if (!(thread.slot_occupation_status & SLOT_NEW))
            thread.delete_thread();
    }
    emptify_one_of_new_read_halted_stacks(*this, READ_halted_stack_new_first);
    emptify_one_of_new_read_halted_stacks(*this, READ_halted_stack_new_second);
    free(READ_halted_slots);

    while (FORK_halted_stack.non_empty())
        FORK_halted_slots[FORK_halted_stack.pop()].delete_thread();
    free(FORK_halted_slots);

    if (matched_thread.slot_occupation_status & SLOT_OCCUPIED){
        matched_thread.delete_thread();
    }
}
|
38
src/libregexis024vm/libregexis024vm_disassembly.cpp
Normal file
38
src/libregexis024vm/libregexis024vm_disassembly.cpp
Normal file
@ -0,0 +1,38 @@
|
||||
#include <libregexis024vm/libregexis024vm.h>
|
||||
#include <libregexis024vm/vm_opcodes.h>
|
||||
|
||||
bool REGEX_IS024_CONTEXT::check_inboundness(int region){
|
||||
return vmprog_check_inboundness(program_size, active_thread.IP, region);
|
||||
}
|
||||
|
||||
uint8_t REGEX_IS024_CONTEXT::extract_b() {
|
||||
return vmprog_extract_b(&active_thread.IP, prg);
|
||||
}
|
||||
|
||||
uint16_t REGEX_IS024_CONTEXT::extract_w() {
|
||||
return vmprog_extract_w(&active_thread.IP, prg);
|
||||
}
|
||||
|
||||
uint32_t REGEX_IS024_CONTEXT::extract_dw() {
|
||||
return vmprog_extract_dw(&active_thread.IP, prg);
|
||||
}
|
||||
|
||||
uint64_t REGEX_IS024_CONTEXT::extract_qw() {
|
||||
return vmprog_extract_qw(&active_thread.IP, prg);
|
||||
}
|
||||
|
||||
uint8_t REGEX_IS024_CONTEXT::extract_instruction() {
|
||||
return extract_b();
|
||||
}
|
||||
|
||||
/* Reads a slot id (a 4-byte value) at the active thread's IP and advances the IP past it. */
regex_sslot_id_t REGEX_IS024_CONTEXT::extract_sslot_id() {
    const regex_sslot_id_t slot_id = extract_dw();
    return slot_id;
}
|
||||
|
||||
/* Reads a near pointer (an 8-byte value) at the active thread's IP and advances the IP past it. */
regex_near_ptr_t REGEX_IS024_CONTEXT::extract_near_pointer() {
    const regex_near_ptr_t target = extract_qw();
    return target;
}
|
||||
|
||||
/* Reads a track array index (a 2-byte value) at the active thread's IP and advances the IP past it. */
regex_tai_t REGEX_IS024_CONTEXT::extract_track_array_index() {
    const regex_tai_t index = extract_w();
    return index;
}
|
105
src/libregexis024vm/libregexis024vm_interface.cpp
Normal file
105
src/libregexis024vm/libregexis024vm_interface.cpp
Normal file
@ -0,0 +1,105 @@
|
||||
#include <libregexis024vm/libregexis024vm_interface.h>
|
||||
#include <libregexis024vm/libregexis024vm.h>
|
||||
#include <libregexis024vm/instruction_implementation.h>
|
||||
|
||||
/* Two events are equal when both the key and the value are equal. */
bool REGEX_IS024_CAEvent::operator==(const REGEX_IS024_CAEvent &other) const {
    if (key != other.key)
        return false;
    return value == other.value;
}
|
||||
|
||||
#define reveal ((REGEX_IS024_CONTEXT*)opaque)
|
||||
|
||||
/* Creates the hidden REGEX_IS024_CONTEXT with the given program and limits and keeps it
 * behind the `opaque` pointer. */
REGEX_IS024_VirtualMachine::REGEX_IS024_VirtualMachine(size_t programSize, const uint8_t *data,
                                                       uint64_t caTreeLimit, regex_tai_t saLenLimit,
                                                       regex_sslot_id_t readSsLimit, regex_sslot_id_t forkSsLimit,
                                                       uint64_t timeTickLimit) {
    REGEX_IS024_CONTEXT* const context = new REGEX_IS024_CONTEXT(programSize, data, caTreeLimit, saLenLimit,
                                                                 readSsLimit, forkSsLimit, timeTickLimit);
    opaque = context;
}
|
||||
|
||||
/* Feeds the start-of-input to the context. May be called once only: a second call terminates
 * the process through exitf(). Returns the error state reported by the context. */
regex024_error_code REGEX_IS024_VirtualMachine::initialize() {
    if (gave_SOF)
        exitf("double feedSOF\n");
    else
        gave_SOF = true;
    return reveal->feedSOF();
}
|
||||
|
||||
bool REGEX_IS024_VirtualMachine::isInitialized() {
|
||||
return reveal->initialized;
|
||||
}
|
||||
|
||||
bool REGEX_IS024_VirtualMachine::isUsable() {
|
||||
return isInitialized() && reveal->error == regex024_error_codes::stable;
|
||||
}
|
||||
|
||||
/* Destroys the hidden context created by the constructor. */
REGEX_IS024_VirtualMachine::~REGEX_IS024_VirtualMachine() {
    REGEX_IS024_CONTEXT* const context = reveal;
    delete context;
}
|
||||
|
||||
/* Selection array length chosen by the program; 0 while the machine is not usable. */
regex_tai_t REGEX_IS024_VirtualMachine::getSelectionArrayLength() {
    if (!isUsable())
        return 0;
    return reveal->selection_array_len;
}
|
||||
|
||||
bool REGEX_IS024_VirtualMachine::isAllowMultistart() {
|
||||
return isUsable() ? reveal->allows_multistart : false;
|
||||
}
|
||||
|
||||
uint8_t REGEX_IS024_VirtualMachine::getInputLeftExtensionSize() {
|
||||
return isUsable() ? reveal->fed_input_extends_left : 0;
|
||||
}
|
||||
|
||||
uint8_t REGEX_IS024_VirtualMachine::getInputRightExtensionSize() {
|
||||
return isUsable() ? reveal->fed_input_extends_right : 0;
|
||||
}
|
||||
|
||||
/* Error state currently recorded in the context. */
regex024_error_code REGEX_IS024_VirtualMachine::getErrno() {
    const regex024_error_code current = reveal->error;
    return current;
}
|
||||
|
||||
/* Stupid kinda function. Checks if somebody is ready to continue reading the actual string */
|
||||
bool REGEX_IS024_VirtualMachine::haveSurvivors() {
|
||||
return isUsable() && (reveal->READ_halted_stack_new_first.non_empty());
|
||||
}
|
||||
|
||||
bool REGEX_IS024_VirtualMachine::isMatched() {
|
||||
return isUsable() && static_cast<bool>((reveal->matched_thread.slot_occupation_status & SLOT_OCCUPIED));
|
||||
}
|
||||
|
||||
std::vector<REGEX_IS024_CAEvent> REGEX_IS024_VirtualMachine::getMatchedThreadCABranchReverse() {
|
||||
if (!isMatched())
|
||||
return {};
|
||||
std::vector<REGEX_IS024_CAEvent> res;
|
||||
REGEX024_CollectionArrayNode* cur = reveal->matched_thread.CAHptr;
|
||||
while (cur != NULL){
|
||||
res.push_back({cur->key, cur->value});
|
||||
cur = cur->prev;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
uint64_t REGEX_IS024_VirtualMachine::getMatchedThreadSAValue(uint16_t key) {
|
||||
if (key >= getSelectionArrayLength())
|
||||
return 0;
|
||||
if (!isMatched())
|
||||
return 0;
|
||||
return reveal->matched_thread.SAptr ? reveal->matched_thread.SAptr[key + 1] : 0;
|
||||
}
|
||||
|
||||
/* Starts one more matching thread. Terminates the process through exitf() when the machine
 * is not usable. Returns the error state reported by the context. */
regex024_error_code REGEX_IS024_VirtualMachine::addNewMatchingThread() {
    const bool usable = isUsable();
    if (!usable)
        exitf("unusable\n");
    // if (started_first_thread && !isAllowMultistart())
    //     exitf("Multistart is forbidden, bad usage of program\n");
    REGEX_IS024_CONTEXT* const context = reveal;
    return context->startThread();
}
|
||||
|
||||
/* Forwards to REGEX_IS024_CONTEXT::extendedFeedCharacter(). Terminates the process through
 * exitf() when the machine is not usable. */
regex024_error_code REGEX_IS024_VirtualMachine::extendedFeedCharacter(uint64_t input) {
    const bool usable = isUsable();
    if (!usable)
        exitf("unusable\n");
    REGEX_IS024_CONTEXT* const context = reveal;
    return context->extendedFeedCharacter(input);
}
|
||||
|
||||
/* Forwards to REGEX_IS024_CONTEXT::feedCharacter(). Terminates the process through exitf()
 * when the machine is not usable. */
regex024_error_code REGEX_IS024_VirtualMachine::feedCharacter(uint64_t input, uint64_t bytesResembled) {
    const bool usable = isUsable();
    if (!usable)
        exitf("unusable\n");
    REGEX_IS024_CONTEXT* const context = reveal;
    return context->feedCharacter(input, bytesResembled);
}
|
46
src/libregexis024vm/libregexis024vm_interface.h
Normal file
46
src/libregexis024vm/libregexis024vm_interface.h
Normal file
@ -0,0 +1,46 @@
|
||||
#ifndef LIBREGEXIS024_LIBREGEXIS024VM_INTERFACE_H
|
||||
#define LIBREGEXIS024_LIBREGEXIS024VM_INTERFACE_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <vector>
|
||||
#include <libregexis024vm/vm_errno.h>
|
||||
#include <libregexis024vm/vm_opcodes_types.h>
|
||||
|
||||
struct REGEX_IS024_CAEvent{
|
||||
regex_tai_t key;
|
||||
uint64_t value;
|
||||
bool operator==(const REGEX_IS024_CAEvent& other) const;
|
||||
};
|
||||
|
||||
class REGEX_IS024_VirtualMachine{
|
||||
public:
|
||||
REGEX_IS024_VirtualMachine(size_t programSize, const uint8_t *data, uint64_t caTreeLimit, uint16_t saLenLimit,
|
||||
uint32_t readSsLimit, uint32_t forkSsLimit, uint64_t timeTickLimit);
|
||||
|
||||
REGEX_IS024_VirtualMachine(const REGEX_IS024_VirtualMachine& ) = delete;
|
||||
REGEX_IS024_VirtualMachine& operator=(const REGEX_IS024_VirtualMachine&) = delete;
|
||||
|
||||
regex024_error_code initialize();
|
||||
bool isInitialized();
|
||||
bool isUsable();
|
||||
virtual ~REGEX_IS024_VirtualMachine();
|
||||
regex_tai_t getSelectionArrayLength();
|
||||
bool isAllowMultistart();
|
||||
uint8_t getInputLeftExtensionSize();
|
||||
uint8_t getInputRightExtensionSize();
|
||||
regex024_error_code getErrno();
|
||||
bool haveSurvivors();
|
||||
bool isMatched();
|
||||
std::vector<REGEX_IS024_CAEvent> getMatchedThreadCABranchReverse();
|
||||
uint64_t getMatchedThreadSAValue(uint16_t key);
|
||||
|
||||
regex024_error_code addNewMatchingThread();
|
||||
regex024_error_code extendedFeedCharacter(uint64_t input);
|
||||
regex024_error_code feedCharacter(uint64_t input, uint64_t bytesResembled);
|
||||
|
||||
private:
|
||||
bool gave_SOF = false;
|
||||
void* opaque;
|
||||
};
|
||||
|
||||
#endif //LIBREGEXIS024_LIBREGEXIS024VM_INTERFACE_H
|
69
src/libregexis024vm/utils.cpp
Normal file
69
src/libregexis024vm/utils.cpp
Normal file
@ -0,0 +1,69 @@
|
||||
#include <libregexis024vm/utils.h>
|
||||
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <stdarg.h>
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
|
||||
/* __ORDER_LITTLE_ENDIAN__ is just a constant that GCC and Clang define on every target, so
 * testing whether it exists never rejects a big-endian build. The byte order of the target
 * is given by __BYTE_ORDER__. */
#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || (__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__)
#error "Big endian is currently unsupported"
#endif
|
||||
|
||||
/* Prints a printf-style message to stderr and terminates the process with exit status 1. */
void exitf(const char *fmt, ...) {
    va_list args;
    va_start(args, fmt);
    (void) vfprintf(stderr, fmt, args);
    va_end(args);
    exit(1);
}
|
||||
|
||||
/* Length in bytes (1..4) of the UTF-8 sequence introduced by firstByte, or -1 when the byte
 * cannot start a sequence (continuation byte or a prefix of five or more set bits). */
int utf8_retrieve_size(uint8_t firstByte) {
    if ((firstByte & 0x80) == 0x00)   /* 0xxxxxxx */
        return 1;
    if ((firstByte & 0xE0) == 0xC0)   /* 110xxxxx */
        return 2;
    if ((firstByte & 0xF0) == 0xE0)   /* 1110xxxx */
        return 3;
    if ((firstByte & 0xF8) == 0xF0)   /* 11110xxx */
        return 4;
    return -1;
}
|
||||
|
||||
/* Decodes the code point of the sz-byte UTF-8 sequence that starts at string[pos].
 * sz must be a positive value returned by utf8_retrieve_size(); the caller guarantees that sz
 * bytes are readable. Returns -1 when one of the trailing bytes is not a continuation byte. */
int32_t utf8_retrieve_character(int sz, size_t pos, const uint8_t *string) {
    const uint8_t* cursor = string + pos;
    if (sz == 1)
        return *cursor;
    /* The lead byte keeps (7 - sz) payload bits */
    uint32_t v = *cursor & (0x7F >> sz);
    for (int remaining = sz - 1; remaining > 0; remaining--){
        cursor++;
        const uint32_t continuation = *cursor;
        if ((continuation & 0xC0) != 0x80)
            return -1;
        v = (v << 6) | (continuation & 0x3F);
    }
    assert(v <= INT32_MAX);
    return static_cast<int32_t>(v);
}
|
||||
|
||||
#define AAAAAA {cp = -1; return;}
|
||||
|
||||
void utf8_string_iterat(int32_t &cp, size_t &adj, size_t pos, const uint8_t *string, size_t string_size) {
|
||||
if (pos >= string_size) AAAAAA
|
||||
adj = utf8_retrieve_size(string[pos]);
|
||||
if (adj < 0 || pos + adj > string_size) AAAAAA
|
||||
if ((cp = utf8_retrieve_character(adj, pos, string)) < 0) AAAAAA
|
||||
}
|
||||
|
||||
/* Tells whether strSample is equal (strcmp) to one of the strings of strSet.
 * strSet is an array of C strings terminated by a NULL entry. */
bool is_string_in_stringset(const char *strSample, const char **strSet) {
    for (const char** candidate = strSet; *candidate != NULL; ++candidate){
        if (strcmp(strSample, *candidate) == 0)
            return true;
    }
    return false;
}
|
21
src/libregexis024vm/utils.h
Normal file
21
src/libregexis024vm/utils.h
Normal file
@ -0,0 +1,21 @@
|
||||
#ifndef LIBREGEXIS024_UTILS_H
|
||||
#define LIBREGEXIS024_UTILS_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
void exitf(const char* fmt, ...);
|
||||
|
||||
/* 1, 2, 3, 4 on success; -1 on error */
|
||||
int utf8_retrieve_size(uint8_t firstByte);
|
||||
|
||||
/* sz is a positive value returned by utf8_retrieve_size. Returns negative on error */
|
||||
int32_t utf8_retrieve_character(int sz, size_t pos, const uint8_t* string);
|
||||
|
||||
/* cp is negative on error. adj is the size of letter in bytes. Can be used to adjust pos.
|
||||
* All safety checks will be performed */
|
||||
void utf8_string_iterat(int32_t& cp, size_t& adj, size_t pos, const uint8_t* string, size_t string_size);
|
||||
|
||||
bool is_string_in_stringset(const char* strSample, const char* strSet[]);
|
||||
|
||||
#endif //LIBREGEXIS024_UTILS_H
|
26
src/libregexis024vm/vm_errno.cpp
Normal file
26
src/libregexis024vm/vm_errno.cpp
Normal file
@ -0,0 +1,26 @@
|
||||
#include <libregexis024vm/vm_errno.h>
|
||||
|
||||
/* Name of an error code as a static string: the identifier of the enumerator, or
 * "unknown_error_code" for a value that is not one of the enumerators.
 * Every enumerator of regex024_error_codes has a case, including program_throw, which was
 * missing and therefore reported as unknown. */
const char *regex024_error_code_tostr(regex024_error_code x) {
#define rcase(name) case regex024_error_codes::name: return #name;
    switch (x) {
        rcase(stable)
        rcase(ca_tree_limit_violation)
        rcase(sa_length_limit_violation)
        rcase(read_sslot_count_limit_violation)
        rcase(fork_sslot_count_limit_violation)
        rcase(timeout)
        rcase(improper_finish)
        rcase(too_early)
        rcase(too_late)
        rcase(selection_arr_out_of_range)
        rcase(read_sslot_out_of_range)
        rcase(fork_sslot_out_of_range)
        rcase(invalid_opcode)
        rcase(invalid_register_code)
        rcase(instruction_not_for_general_thread)
        rcase(instruction_not_for_collision_thread)
        rcase(program_throw)
        rcase(bad_alloc)
        default:
            return "unknown_error_code";
    }
/* The helper macro is local to this function */
#undef rcase
}
|
45
src/libregexis024vm/vm_errno.h
Normal file
45
src/libregexis024vm/vm_errno.h
Normal file
@ -0,0 +1,45 @@
|
||||
#ifndef LIBREGEXIS024_VM_ERRNO_H
|
||||
#define LIBREGEXIS024_VM_ERRNO_H
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
/* Error states of the virtual machine. Zero means "no error", every failure is negative. */
namespace regex024_error_codes {
    enum regex024_error_code_I: int {
        stable = 0,

        /* Violations of the configured limits */
        ca_tree_limit_violation = -1,
        sa_length_limit_violation = -2,
        read_sslot_count_limit_violation = -3,
        fork_sslot_count_limit_violation = -4,
        timeout = -5,

        /* Threads should either be abandoned by the user of the virtual machine after MATCH,
         * or be stopped by the DIE instruction. Out of bound jump is disallowed */
        improper_finish = -6,
        /* Operation for general phase is executed in init phase */
        too_early = -7,
        /* Operation for init phase is executed in general phase */
        too_late = -8,

        /* Used selection array index is out of range */
        selection_arr_out_of_range = -9,
        /* Used read slot is out of range */
        read_sslot_out_of_range = -10,
        /* Used fork slot is out of range */
        fork_sslot_out_of_range = -11,

        invalid_opcode = -12,
        invalid_register_code = -13,
        /* Next operation scheduled for execution is forbidden in general thread */
        instruction_not_for_general_thread = -14,
        /* Next operation scheduled for execution is forbidden in collision thread */
        instruction_not_for_collision_thread = -15,

        /* Program willingly threw exception */
        program_throw = -16,
        /* O_o */
        bad_alloc = -17,
    };
}
|
||||
|
||||
/* Short name of the error code type for code outside the namespace */
using regex024_error_code = regex024_error_codes::regex024_error_code_I;
|
||||
|
||||
const char* regex024_error_code_tostr(regex024_error_code x);
|
||||
|
||||
#endif //LIBREGEXIS024_VM_ERRNO_H
|
99
src/libregexis024vm/vm_opcodes.h
Normal file
99
src/libregexis024vm/vm_opcodes.h
Normal file
@ -0,0 +1,99 @@
|
||||
#ifndef LIBREGEXIS024_VM_OPCODES_H
|
||||
#define LIBREGEXIS024_VM_OPCODES_H
|
||||
|
||||
#include <libregexis024vm/vm_opcodes_types.h>
|
||||
|
||||
/* Opcodes of the virtual machine bytecode. Every opcode occupies one byte; the comment above
 * each enumerator lists the operands that follow it. */
namespace regex024_opcodes {
    enum regex024_opcode_I: uint8_t{
        /* READ <Settlement ID> */
        READ = 0,
        /* READZ = READ 0 */
        READZ = 1,
        /* JUMP <Near pointer> */
        JUMP = 2,

        /* JCEQUAL - jump conditional (equal): JCEQUAL <s1> <Near pointer> */
        JCEQUAL_B = 3,
        JCEQUAL_W = 4,
        JCEQUAL_DW = 5,
        JCEQUAL_QW = 6,
        /* JCLESS - jump conditional (less): JCLESS <s1> <Near pointer> */
        JCLESS_B = 7,
        JCLESS_W = 8,
        JCLESS_DW = 9,
        JCLESS_QW = 10,
        /* JCGRTR - jump conditional (greater): JCGRTR <s1> <Near pointer> */
        JCGRTR_B = 11,
        JCGRTR_W = 12,
        JCGRTR_DW = 13,
        JCGRTR_QW = 14,

        /* FORK <Settlement ID> <Near pointer> */
        FORK = 15,
        /* MATCH | */
        MATCH = 16,
        /* DIE | */
        DIE = 17,
        /* PARAM_READ_SS_NUMBER <Settlement ID (length)> */
        PARAM_READ_SS_NUMBER = 18,
        /* PARAM_FORK_SS_NUMBER <Settlement ID (length)> */
        PARAM_FORK_SS_NUMBER = 19,
        /* PARAM_SELARR_LEN <Track array index (length)> */
        PARAM_SELARR_LEN = 20,
        /* PARAM_COLSIFTFUNC_SET <Near pointer> */
        PARAM_COLSIFTFUNC_SET = 21,
        /* PARAM_COLSIFTFUNC_WIPE */
        PARAM_COLSIFTFUNC_WIPE = 22,
        /* MSG_MULTISTART_ALLOWED <1B> */
        MSG_MULTISTART_ALLOWED = 23,
        /* MSG_FED_INPUT_EXTENDED <1B> <1B> <Settlement ID (length of suffix)> */
        MSG_FED_INPUT_EXTENDED = 24,
        /* DMOV_RABX_SELARR <Track array index> */
        DMOV_RABX_SELARR = 25,
        /* DDIST_RABX_SELARR <Track array index> <Track array index> */
        DDIST_RABX_SELARR = 26,
        /* SIFTPRIOR_MIN_RABX */
        SIFTPRIOR_MIN_RABX = 27,
        /* SIFTPRIOR_MAX_RABX */
        SIFTPRIOR_MAX_RABX = 28,
        /* SIFT_DONE */
        SIFT_DONE = 29,
        /* MOV_COLARR_IMM <Track array index> <8B> */
        MOV_COLARR_IMM = 30,
        /* MOV_COLARR_BTPOS <Track array index> */
        MOV_COLARR_BTPOS = 31,
        /* MOV_SELARR_IMM <Track array index> <8B> */
        MOV_SELARR_IMM = 32,
        /* MOV_SELARR_CHPOS <Track array index> */
        MOV_SELARR_CHPOS = 33,
        /* INIT */
        INIT = 34,
        /* THROW */
        THROW = 35,

        /* One past the largest valid opcode */
        regex024_opcode_greaterMax = 36
    };
}
|
||||
|
||||
/* Short name of the opcode type for code outside the namespace */
using regex024_opcode = regex024_opcodes::regex024_opcode_I;
|
||||
|
||||
const char* regex024_opcode_tostr(regex024_opcode x);
|
||||
|
||||
|
||||
/* Sizes, in bytes, of the elements of the bytecode */
/* opcode */
constexpr uint64_t REGEX024_BYTECODE_INSTRUCTION_SZ{1};
/* slot (settlement) id */
constexpr uint64_t REGEX024_BYTECODE_SSLOT_ID_SZ{4};
/* track array index */
constexpr uint64_t REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ{2};
/* near pointer */
constexpr uint64_t REGEX024_BYTECODE_NEAR_POINTER_SZ{8};
|
||||
|
||||
bool vmprog_check_inboundness(regex_near_ptr_t prgSize, regex_near_ptr_t IP, regex_near_ptr_t region);
|
||||
|
||||
uint8_t vmprog_extract_b(regex_near_ptr_t* IPptr, const uint8_t* prg);
|
||||
uint16_t vmprog_extract_w(regex_near_ptr_t* IPptr, const uint8_t* prg);
|
||||
uint32_t vmprog_extract_dw(regex_near_ptr_t* IPptr, const uint8_t* prg);
|
||||
uint64_t vmprog_extract_qw(regex_near_ptr_t* IPptr, const uint8_t* prg);
|
||||
|
||||
uint8_t vmprog_extract_instruction(regex_near_ptr_t* IPptr, const uint8_t* prg);
|
||||
regex_sslot_id_t vmprog_extract_sslot_id(regex_near_ptr_t* IPptr, const uint8_t* prg);
|
||||
regex_near_ptr_t vmprog_extract_near_pointer(regex_near_ptr_t* IPptr, const uint8_t* prg);
|
||||
regex_tai_t vmprog_extrack_track_array_index(regex_near_ptr_t* IPptr, const uint8_t* prg);
|
||||
|
||||
#endif //LIBREGEXIS024_VM_OPCODES_H
|
47
src/libregexis024vm/vm_opcodes_disassembly.cpp
Normal file
47
src/libregexis024vm/vm_opcodes_disassembly.cpp
Normal file
@ -0,0 +1,47 @@
|
||||
#include <libregexis024vm/vm_opcodes.h>
|
||||
|
||||
/* __ORDER_LITTLE_ENDIAN__ is just a constant that GCC and Clang define on every target, so
 * testing whether it exists never rejects a big-endian build. The byte order of the target
 * is given by __BYTE_ORDER__. */
#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || (__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__)
#error "Big endian is currently unsupported"
#endif
|
||||
|
||||
/* Tells whether the `region` bytes starting at offset IP lie inside a program of prgSz bytes.
 * The check is written without additions: `IP + region` wraps around for an IP close to the
 * maximum of the type, and the wrapped sum would wrongly pass as being in bounds. */
bool vmprog_check_inboundness(regex_near_ptr_t prgSz, regex_near_ptr_t IP, regex_near_ptr_t region) {
    if (IP > prgSz)
        return false;
    return region <= prgSz - IP;
}
|
||||
|
||||
/* Returns the byte at offset *IPptr of the program and advances *IPptr by one.
 * The caller is responsible for the bounds check. */
uint8_t vmprog_extract_b(regex_near_ptr_t *IPptr, const uint8_t *prg) {
    const uint8_t answer = prg[*IPptr];
    *IPptr += 1;
    return answer;
}
|
||||
|
||||
/* Returns the little-endian 16-bit value at offset *IPptr of the program and advances *IPptr
 * by two. The caller is responsible for the bounds check.
 * The value is assembled byte by byte: operands sit at arbitrary offsets of the byte stream,
 * so reading them through a casted uint16_t pointer is a misaligned access that also breaks
 * the aliasing rules. */
uint16_t vmprog_extract_w(regex_near_ptr_t *IPptr, const uint8_t *prg) {
    const uint8_t* bytes = prg + *IPptr;
    const uint16_t answer = static_cast<uint16_t>(bytes[0] | (bytes[1] << 8));
    *IPptr += 2;
    return answer;
}
|
||||
|
||||
/* Returns the little-endian 32-bit value at offset *IPptr of the program and advances *IPptr
 * by four. The caller is responsible for the bounds check.
 * The value is assembled byte by byte: operands sit at arbitrary offsets of the byte stream,
 * so reading them through a casted uint32_t pointer is a misaligned access that also breaks
 * the aliasing rules. */
uint32_t vmprog_extract_dw(regex_near_ptr_t *IPptr, const uint8_t *prg) {
    const uint8_t* bytes = prg + *IPptr;
    uint32_t answer = 0;
    /* Most significant byte first, so that every step shifts the partial result left */
    for (int i = 3; i >= 0; i--)
        answer = (answer << 8) | bytes[i];
    *IPptr += 4;
    return answer;
}
|
||||
|
||||
/* Returns the little-endian 64-bit value at offset *IPptr of the program and advances *IPptr
 * by eight. The caller is responsible for the bounds check.
 * The value is assembled byte by byte: operands sit at arbitrary offsets of the byte stream,
 * so reading them through a casted uint64_t pointer is a misaligned access that also breaks
 * the aliasing rules. */
uint64_t vmprog_extract_qw(regex_near_ptr_t *IPptr, const uint8_t *prg) {
    const uint8_t* bytes = prg + *IPptr;
    uint64_t answer = 0;
    /* Most significant byte first, so that every step shifts the partial result left */
    for (int i = 7; i >= 0; i--)
        answer = (answer << 8) | bytes[i];
    *IPptr += 8;
    return answer;
}
|
||||
|
||||
/* Reads the one-byte opcode at *IPptr and advances *IPptr past it. */
uint8_t vmprog_extract_instruction(regex_near_ptr_t *IPptr, const uint8_t *prg) {
    const uint8_t opcode = vmprog_extract_b(IPptr, prg);
    return opcode;
}
|
||||
|
||||
/* Reads a slot id (4 bytes) at *IPptr and advances *IPptr past it. */
regex_sslot_id_t vmprog_extract_sslot_id(regex_near_ptr_t *IPptr, const uint8_t *prg) {
    const regex_sslot_id_t slot_id = vmprog_extract_dw(IPptr, prg);
    return slot_id;
}
|
||||
|
||||
/* Reads a near pointer (8 bytes) at *IPptr and advances *IPptr past it. */
regex_near_ptr_t vmprog_extract_near_pointer(regex_near_ptr_t *IPptr, const uint8_t *prg) {
    const regex_near_ptr_t target = vmprog_extract_qw(IPptr, prg);
    return target;
}
|
||||
|
||||
/* Reads a track array index (2 bytes) at *IPptr and advances *IPptr past it. */
regex_tai_t vmprog_extrack_track_array_index(regex_near_ptr_t *IPptr, const uint8_t *prg) {
    const regex_tai_t index = vmprog_extract_w(IPptr, prg);
    return index;
}
|
11
src/libregexis024vm/vm_opcodes_types.h
Normal file
11
src/libregexis024vm/vm_opcodes_types.h
Normal file
@ -0,0 +1,11 @@
|
||||
#ifndef VM_OPCODES_TYPES_H
|
||||
#define VM_OPCODES_TYPES_H
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
/* Id of a suspension slot (READ or FORK slot) */
using regex_sslot_id_t = uint32_t;
/* Offset inside the program ("near pointer") */
using regex_near_ptr_t = uint64_t;
/* Track array index: index into a selection array or key of a collection array */
using regex_tai_t = uint16_t;
|
||||
|
||||
|
||||
#endif //VM_OPCODES_TYPES_H
|
Loading…
Reference in New Issue
Block a user