johnchen902's Blog

Expression

Problem

Find the shortest string that matches a given regular expression and contains a given substring. The regular expression consists only of a to z, dot, alternation, concatenation, and kleene star. The length of both input are no more than 10000.

Solution

You can solve this problem only if you can check if the given string matches the regular expression. We can easily construct an equivalent NFA with only O(n) states and total transitions, but how to proceed from here? We can't construct a DFA because it takes exponential time.

We can proceed by first recording the states reachable with the first character of the given string. Then, we can determine the states reachable with the first two characters in O(n). After m similar steps, we can determine the states reachable with the entire string. That means we can check if the NFA accepts a given string in O(nm).

Alternatively, we can proceed with dynamic programming. The data we memorize is whether the accept state can be reached from this state with this suffix of the given string, and the transition is straight-forward. To prevent cyclic dependencies, we have to eliminate ε-cycles beforehand. The time complexity is also O(nm).

Note: For an NFA with k transitions the complexity is O((n+k)m) for both algorithm.

Back to the original problem. We can find the minimum characters needed from each state to the accept state easily with a backward bfs. Then we can change the latter algorithm to memorizing the minimum characters needed to reach the accept state from this state with this suffix of the given string as prefix. One more backward bfs and we can determine the length of the answer. Finally we can reconstruct the answer by backtracking.

On Codeforces the memory limit is 256MiB. I have to use int16_t and can only afford n+1 states in the NFA.

Source Code

#include <cstdio>
#include <vector>
#include <stack>
#include <algorithm>
#include <utility>
#include <cassert>
#include <queue>
#include <cstring>
#include <functional>
using namespace std;

vector<pair<char, int>> adj[10001];
int nstates;
int new_state() {
    return nstates++;
}

int parse_a(int s);
int parse_b(int s);
int parse_c(int s);

int parse_a(int s) {
    int t = parse_b(s);
    while(true) {
        char c = getchar();
        if(c == '\n' || c == ')') {
            ungetc(c, stdin);
            break;
        }
        assert(c == '|');
        int s2 = new_state();
        adj[s].emplace_back('\0', s2);
        int t2 = parse_b(s2);
        adj[t2].emplace_back('\0', t);
    }
    return t;
}

int parse_b(int s) {
    while(true) {
        char c = getchar();
        ungetc(c, stdin);
        if(c == ')' || c == '|' || c == '\n')
            break;
        s = parse_c(s);
    }
    return s;
}

void ignore_more_stars() {
    int c;
    while((c = getchar()) == '*')
        ;
    ungetc(c, stdin);
}

int parse_c(int s) {
    int c = getchar();
    if(c == '.' || (c >= 'a' && c <= 'z')) {
        char d = getchar();
        if(d == '*') {
            int t = new_state();
            adj[s].emplace_back('\0', t);
            adj[t].emplace_back(c, t);
            ignore_more_stars();
            return t;
        } else {
            ungetc(d, stdin);
            int t = new_state();
            adj[s].emplace_back(c, t);
            return t;
        }
    } else if(c == '*') {
        ignore_more_stars();
        return s;
    } else {
        assert(c == '(');
        int s2 = new_state();
        adj[s].emplace_back('\0', s2);
        int t = parse_a(s2);
        char rparen = getchar();
        assert(rparen == ')');
        char d = getchar();
        if(d == '*') {
            adj[t].emplace_back('\0', s2);
            ignore_more_stars();
            return s2;
        } else {
            ungetc(d, stdin);
            return t;
        }
    }
}

int id[10001], low[10001], nextid = 1;
bool onstack[10001];
stack<int> tjstack;
int mapp[10001];
void torjan(int i) {
    onstack[i] = true;
    tjstack.push(i);
    low[i] = id[i] = nextid++;
    for(auto p : adj[i]) {
        if(p.first)
            continue;
        int c = p.second;
        if(!id[c]) {
            torjan(c);
            low[i] = min(low[i], low[c]);
        } else if(onstack[c]) {
            low[i] = min(low[i], id[c]);
        }
    }
    if(low[i] == id[i]) {
        int x;
        do {
            x = tjstack.top();
            tjstack.pop();
            onstack[x] = false;
            mapp[x] = i;
        } while(x != i);
    }
}
void eliminate() {
    for(int i = 0; i < nstates; i++)
        if(id[i] == 0)
            torjan(i);
    for(int i = 0; i < nstates; i++)
        for(auto &p : adj[i])
            p.second = mapp[p.second];
    for(int i = 0; i < nstates; i++)
        if(i != mapp[i]) {
            adj[mapp[i]].insert(adj[mapp[i]].end(),
                    adj[i].begin(), adj[i].end());
            adj[i].clear();
        }
    for(int i = 0; i < nstates; i++)
        adj[i].erase(remove(adj[i].begin(), adj[i].end(),
                make_pair('\0', i)), adj[i].end());
}

vector<pair<bool, int>> rev[10001];
template<typename T>
void backward_bfs(int t, T *out) {
    for(int i = 0; i < nstates; i++)
        for(auto p : adj[i])
            rev[p.second].emplace_back(p.first, i);
    fill_n(out, nstates, 32000);
    using P = pair<int, int>;
    priority_queue<P, vector<P>, greater<P>> pq;
    out[t] = 0;
    pq.push(make_pair(0, t));
    while(!pq.empty()) {
        P p = pq.top();
        pq.pop();
        if(p.first != out[p.second])
            continue;
        for(auto q : rev[p.second])
            if(out[q.second] > p.first + (q.first ? 1 : 0)) {
                out[q.second] = p.first + (q.first ? 1 : 0);
                pq.push(make_pair(out[q.second], q.second));
            }
    }
}

char str[10001];
int16_t dp[10001][10001];

int dfs(int st, int si, int n) {
    if(si == n || dp[si][st])
        return dp[si][st];
    int ans = 32000;
    for(auto p : adj[st])
        if(p.first == '\0') {
            ans = min(ans, dfs(p.second, si, n));
        } else if(p.first == '.' || p.first == str[si]) {
            ans = min(ans, dfs(p.second, si + 1, n) + 1);
        }
    return dp[si][st] = ans;
}

int ans[10001];
void backward_bfs2() {
    copy_n(dp[0], nstates, ans);
    using P = pair<int, int>;
    priority_queue<P, vector<P>, greater<P>> pq;
    for(int i = 0; i < nstates; i++)
        if(i == mapp[i])
            pq.push(make_pair(ans[i], i));
    while(!pq.empty()) {
        P p = pq.top();
        pq.pop();
        if(p.first != ans[p.second])
            continue;
        for(auto q : rev[p.second])
            if(ans[q.second] > p.first + (q.first ? 1 : 0)) {
                ans[q.second] = p.first + (q.first ? 1 : 0);
                pq.push(make_pair(ans[q.second], q.second));
            }
    }
}

void myputchar(char c) {
    if(c == '.')
        putchar('a');
    else if(c >= 'a' && c <= 'z')
        putchar(c);
}

void answer(int x, int t, int n) {
    while(dp[0][x] > ans[x])
        for(auto p : adj[x])
            if(ans[p.second] + (p.first ? 1 : 0) == ans[x]) {
                myputchar(p.first);
                x = p.second;
                break;
            }
    for(int i = 0; i < n;)
        for(auto p : adj[x])
            if(p.first == '\0') {
                if(dp[i][p.second] == dp[i][x]) {
                    x = p.second;
                    break;
                }
            } else if(p.first == '.' || p.first == str[i]) {
                if(dp[i + 1][p.second] == dp[i][x] - 1) {
                    myputchar(str[i++]);
                    x = p.second;
                    break;
                }
            }
    while(x != t)
        for(auto p : adj[x])
            if(dp[n][p.second] + (p.first ? 1 : 0) == dp[n][x]) {
                myputchar(p.first);
                x = p.second;
                break;
            }
    putchar('\n');
}

int main() {
    // parse input
    int s = new_state();
    int t = parse_a(s);
    scanf("%s", str);
    int n = strlen(str);

    // eliminate ε-cycles
    eliminate();
    s = mapp[s];
    t = mapp[t];
    // backward bfs
    backward_bfs(t, dp[n]);
    // dp
    for(int i = 0; i < nstates; i++)
        if(i == mapp[i])
            dfs(i, 0, n);
    // backward bfs 2
    backward_bfs2();
    // finally
    if(ans[s] >= 31000) {
        puts("NO");
        return 0;
    }
    answer(s, t, n);
}