Generate Matching Regexes

Question

Write a program that takes two lists of strings and generates a javascript regex that fully matches all the strings in the first list and matches none of the strings in the second list.

To be specific: for all strings in the first list, str.match(/[your-output]/) must produce a match for the full string, and for all strings in the second list, str.match(/[your-output]/) must produce no matches.

Scoring

Your score is the average length of the regexes you produce, not the length of your program. Your regexes should be tested against this file: https://pastebin.com/9iNZvGJC. Take the file in blocks of 20 lines and run your program with the first ten lines to match and the second ten to not match. Average the lengths of your program's output for all of the blocks in the file. This is your score.

Rules

Do not output the / for the regex

Was about to VTC as dupe of this old classic, but then looking at it I'm not actually sure how it was supposed to be scored... — Unrelated String
– Unrelated String, Commented Jan 22, 2022 at 2:38
Do you mean "Do not output the \ for the regex" or "Do not output the / for the regex"? — Fmbalbuena
– Fmbalbuena, Commented Jan 22, 2022 at 11:34
Is that OK if our program is optimized for the battery test and may fail on other inputs? — Arnauld
– Arnauld, Commented Jan 23, 2022 at 1:06
It should work for all inputs but of course it can work better for the battery. — user197974
– user197974, Commented Jan 23, 2022 at 9:26

Arnauld · Accepted Answer · 2022-01-23 13:41:01Z

JavaScript (ES7), Score: 767 / 50 = 15.34

This code is rather ugly and implements only one strategy. (I did try negative lookaheads, but it was not worth the effort and is disabled here.)

It could probably shave off a few more bytes with a higher maximum branching factor and/or a more clever pattern scoring function. For instance, /.*[LObcln]o.*/ and /.*(Y|o[otvy]).*/ are better solutions for the first 2 test cases which are currently not found.

The TIO link only includes the first 40 lines.

// Longest candidate pattern, counted in (possibly escaped) input characters.
const MAX_PAT_LEN = 2;
// Maximum number of candidate patterns explored at each level of the search.
const MAX_BF = 8;
// Characters that get a backslash prefix before being used literally in a pattern.
const ESC = ".^$*+?()[{\\|";

// Read the whole test battery from stdin. CRLF line endings are tolerated:
// a trailing "\r" can never be matched by "." and would make every block fail.
const list = require("fs").readFileSync(0).toString().split(/\r?\n/);
const block = [];

// Cut the input into blocks of 20 lines: 10 strings to match, then 10 to reject.
// The loop stops at the first block whose first line is empty or missing.
for(let n = 0; list[n]; n += 20) {
  block.push([list.slice(n, n + 10), list.slice(n + 10, n + 20)]);
}

// Sum of the lengths of all generated regexes.
let score = 0;

block.forEach(([A, B]) => {
  const res = process(A, B, false);

  // process() yields undefined when its search finds no covering pattern set.
  if(res === undefined) {
    throw new Error("No regex found");
  }

  console.log(res.length.toString().padStart(2) + " /" + res + "/");

  // Compile once per block instead of once per tested string.
  const regex = RegExp(res);

  // Every string of A must be matched in full, no string of B may match at all.
  if(
    A.some(s => (s.match(regex) || [])[0] !== s) ||
    B.some(s => regex.test(s))
  ) {
    throw new Error("Failed");
  }

  score += res.length;
});

console.log(score);

/**
 * Builds a regex source that matches every string of A and none of B.
 *
 * Candidate patterns are all sequences of 1..MAX_PAT_LEN characters taken from
 * the inputs (metacharacters listed in ESC are backslash-escaped) that match
 * at least one string of A and no string of B. A branching search (at most
 * MAX_BF candidates per level, best coverage first) picks sets of candidates
 * that together cover A; each set is folded by compress() and the shortest
 * resulting expression is returned.
 *
 * @param {string[]} A   strings that must be matched
 * @param {string[]} B   strings that must not be matched
 * @param {boolean}  neg when truthy the result is wrapped as "^(?!.*X).*",
 *                       otherwise as ".*X.*"
 * @returns {string|undefined} regex source without delimiters, or undefined
 *                             when the search finds no covering set
 */
function process(A, B, neg) {
  // Distinct characters of all inputs, escaped where they are metacharacters.
  const chr = [...new Set([...A, ...B].join(''))]
              .map(c => ESC.includes(c) ? "\\" + c : c);

  // Compile every candidate exactly once and drop those that hit a string of B.
  // The list keeps the (length, choose() order) enumeration order.
  const compiled = new Map();
  const candidates = [];

  for(let n = 1; n <= MAX_PAT_LEN; n++) {
    for(const parts of choose(chr, n)) {
      const pat = parts.join('');
      const regex = RegExp(pat);

      if(!B.some(s => regex.test(s))) {
        compiled.set(pat, regex);
        candidates.push(pat);
      }
    }
  }

  // Usable patterns, inserted string by string so that the insertion order
  // (which breaks ties in the ranking below) is: first string of A that the
  // pattern matches, then enumeration order.
  const set = new Set();

  for(const s of A) {
    for(const pat of candidates) {
      if(compiled.get(pat).test(s)) {
        set.add(pat);
      }
    }
  }

  // Number of strings in `strings` matched by `pat`.
  function score(strings, pat) {
    const regex = compiled.get(pat);

    return strings.reduce((total, s) => total + (regex.test(s) ? 1 : 0), 0);
  }

  const solution = [];

  // Depth-first search: `rest` holds the strings of A not covered yet,
  // `expr` the patterns chosen so far.
  function search(rest, expr) {
    if(!rest.length) {
      const c = compress(expr);

      solution.push(neg ? "^(?!.*" + c + ").*" : ".*" + c + ".*");
      return;
    }

    // Score each pattern once, keep those that cover something, best first.
    // The sort is stable, so ties keep the insertion order of `set`.
    const ranked = [...set]
                   .map(pat => [pat, score(rest, pat)])
                   .filter(([, count]) => count > 0)
                   .sort((x, y) => y[1] - x[1]);

    ranked.slice(0, MAX_BF).forEach(([pat]) => {
      const regex = compiled.get(pat);

      search(rest.filter(s => !regex.test(s)), [...expr, pat]);
    });
  }

  search(A, []);

  // Shortest expression wins; undefined when no solution was recorded.
  return solution.sort((a, b) => a.length - b.length)[0];
}

/**
 * Enumerates every ordered selection of n items from list, repetition allowed.
 *
 * Selections are produced by counting from 0 to list.length ** n - 1 and
 * reading the counter as base-list.length digits, least significant digit
 * first, so the first position of a selection varies fastest.
 *
 * @param {Array} list items to pick from
 * @param {number} n   number of items per selection
 * @returns {Array[]} list.length ** n selections of n items each
 */
function choose(list, n) {
  const base = list.length;
  const total = base ** n;
  const selections = [];
  let counter = 0;

  while(counter < total) {
    const picked = [];
    let rest = counter;

    // Peel off one base-`base` digit per position.
    for(let pos = 0; pos < n; pos++) {
      picked.push(list[rest % base]);
      rest = Math.floor(rest / base);
    }

    selections.push(picked);
    counter++;
  }

  return selections;
}

/**
 * Folds a list of literal patterns into one alternation, merging patterns that
 * differ in a single position into a character class when that is not longer.
 *
 * Patterns are expected to be sequences of literal characters in which regex
 * metacharacters are backslash-escaped. A backslash escape is treated as one
 * unit, so it is never split and can itself become a class member.
 *
 * The input array is not modified. Duplicates are ignored.
 *
 * @param {string[]} list patterns to combine
 * @returns {string} "" for an empty list, the single alternative when only one
 *                   remains, otherwise "(alt1|alt2|...)"
 */
function compress(list) {
  // Metacharacters that are plain literals inside [...], so the backslash
  // needed outside a class can be dropped there.
  const PLAIN_IN_CLASS = ".$*+?()[{|";

  // Split a pattern into atoms; "\x" counts as a single atom.
  const tokenize = s => s.match(/\\[\s\S]|[\s\S]/g) || [];

  // Spelling of one atom as a character-class member.
  const asClassMember = tok => {
    if(tok.length === 2 && tok[0] === "\\") {
      return PLAIN_IN_CLASS.includes(tok[1]) ? tok[1] : tok;
    }
    // An unescaped "]" would close the class early.
    return tok === "]" ? "\\]" : tok;
  };

  // Sorted, de-duplicated working copy; the caller's array stays untouched.
  let remaining = [...new Set(list)].sort()
                  .map(text => ({ text, toks: tokenize(text) }));
  const merged = [];
  const maxW = Math.max(0, ...remaining.map(p => p.toks.length));

  for(let w = 1; w <= maxW; w++) {
    for(let i = 0; i < w; i++) {
      // Group the still-unmerged patterns of w atoms by everything except
      // atom i. Rebuilding the groups from `remaining` on every pass keeps a
      // pattern from being merged twice at two different positions.
      const groups = new Map();

      remaining.forEach(p => {
        if(p.toks.length !== w) {
          return;
        }

        const head = p.toks.slice(0, i).join('');
        const tail = p.toks.slice(i + 1).join('');
        const key = JSON.stringify([head, tail]);

        if(!groups.has(key)) {
          groups.set(key, { head, tail, members: [], sources: [] });
        }

        const group = groups.get(key);

        group.members.push(asClassMember(p.toks[i]));
        group.sources.push(p);
      });

      groups.forEach(group => {
        // A "-" is moved to the end of the class, where it is a literal and
        // cannot form a range with its neighbours.
        const body = group.members.filter(m => m !== "-").join('') +
                     (group.members.includes("-") ? "-" : "");
        const alt = group.head + "[" + body + "]" + group.tail;

        // Compare lengths including one "|" separator per alternative.
        const separate = group.sources
                         .reduce((sum, p) => sum + p.text.length + 1, 0);

        if(alt.length + 1 <= separate) {
          merged.push(alt);
          remaining = remaining.filter(p => !group.sources.includes(p));
        }
      });
    }
  }

  const parts = [...remaining.map(p => p.text), ...merged];

  return parts.length > 1 ? "(" + parts.join('|') + ")" : parts.join('|');
}

Try it online!

Full output

14 /.*(an|la|ts).*/
17 /.*( n|Y|o[otv]).*/
19 /.*(J|ca|go|e[dy]).*/
16 /.*(j|ns|s,|t').*/
19 /.*(ke|wh|i[,dgl]).*/
12 /.*( c|W|p).*/
16 /.*(e'|j|nn|ve).*/
18 /.*(B|T|op|r,|un).*/
12 /.*a[ btuy].*/
16 /.*('s|3|r[ i]).*/
13 /.*(H|ar|po).*/
14 /.*(I |ie|nd).*/
16 /.*(sa|[nrs]\.).*/
17 /.*(Ye|ay|it|wh).*/
13 /.*("|nt|we).*/
16 /.*( f|A|He|am).*/
17 /.*(ce|id|ma|r ).*/
16 /.*(nk|o[bfpw]).*/
13 /.*( r|T|fl).*/
14 /.*(y |[1SW]).*/
18 /.*('l|un|[btw]e).*/
14 /.*(L| [Obe]).*/
15 /.*( p|-|eb|j).*/
13 /.*(H|ma|se).*/
10 /.*[clt]e.*/
15 /.*(G|ne|ry|x).*/
16 /.*(J|S|o[dos]).*/
16 /.*(Al|W|oo|w ).*/
16 /.*(T|j|['cn]t).*/
16 /.*(s\.|r[osy]).*/
18 /.*(E|il|[Inrs]s).*/
16 /.*(S|n'|su|uc).*/
16 /.*( w|s[!emo]).*/
15 /.*(!|K|ea|rr).*/
16 /.*(D|Yo|h,|si).*/
11 /.*[dhlr]i.*/
17 /.*(ll|st| [rv]).*/
15 /.*(!|H|a[ds]).*/
16 /.*( f|F|ic|wa).*/
16 /.*(ev|[abgu]r).*/
17 /.*(ti|ul|[AFS]).*/
16 /.*(ed| [RYly]).*/
14 /.*(ea|a[ b]).*/
18 /.*(Oa|k |['be]l).*/
15 /.*(\? |ni|ot).*/
13 /.*("|al|ta).*/
11 /.*(f |pa).*/
15 /.*(L|Re|W|nk).*/
18 /.*(fl|ly|[enr]t).*/
17 /.*('t|om|[dt]e).*/

Just a note. The regexes are supposed to match the full string not just a part of it. — user197974
– user197974, Commented Jan 23, 2022 at 9:26

Ajax1234 · Accepted Answer · 2023-09-18 03:13:17Z

Python3, Score: 762/50 = 15.24 in ~30.51 seconds

import collections, re, random

def test_data():
    """Yield (to_match, to_reject) pairs read from 'inp_d.txt'.

    The file is consumed in blocks of 20 lines: up to 10 lines for the first
    list, then up to 10 for the second, with newline characters stripped.
    Iteration ends without yielding when the first list has fewer than 10
    entries, and ends after yielding when the second list is short.
    """
    def take(handle):
        # Collect up to 10 lines; next() returns None once the file is exhausted.
        rows = []
        for _ in range(10):
            line = next(handle, None)
            if line:
                rows.append(line.strip('\n'))
        return rows

    with open('inp_d.txt') as f:
        while True:
            l1 = take(f)
            l2 = take(f)
            if len(l1) < 10:
                return
            yield l1, l2
            if len(l2) < 10:
                return

def substrings(l1, l2):
    """Collect every substring of the strings in l1 that occurs in no string of l2.

    Returns a pair of dicts:
      * substring length -> sorted list of admissible substrings of that length
      * substring        -> sorted list of indices (into l1) of the strings
                            containing it
    """
    C = collections.defaultdict(set)
    C1 = collections.defaultdict(set)
    for I, l in enumerate(l1):
        for i in range(len(l)):
            # The slice end j is exclusive, so it has to run up to len(l)
            # inclusive; stopping at len(l) - 1 would silently drop every
            # substring that ends at the last character.
            for j in range(i+1, len(l)+1):
                L = l[i:j]
                if all(L not in k for k in l2):
                    C[j-i].add(L)
                    C1[L].add(I)

    return {a:sorted(b) for a,b in C.items()}, {a:sorted(b) for a, b in C1.items()}

def to_regexp(k):
    """Render a (prefixes, base, suffixes) triple as regex source.

    k[0] and k[2] are sequences of alternative strings placed before and after
    the literal k[1]; an empty sequence contributes nothing. Regex
    metacharacters are backslash-escaped so every part is matched literally.
    A single alternative is written bare, several one-character alternatives
    become a character class, and longer alternatives become a group.
    """
    def literal(text):
        # Escape the characters that are special outside a character class.
        return ''.join('\\' + ch if ch in '.^$*+?()[{\\|' else ch for ch in text)

    def group(members):
        if not members:
            return ''
        if len(members) == 1:
            return literal(members[0])
        if all(len(m) == 1 for m in members):
            # Inside [...] only backslash, ']', '^' and '-' need escaping.
            return '[' + ''.join('\\' + m if m in '\\]^-' else m for m in members) + ']'
        return '(' + '|'.join(literal(m) for m in members) + ')'

    return group(k[0]) + literal(k[1]) + group(k[2])

def full_regexp(regexp):
    """Join regex fragments into one alternation.

    A list holding exactly one fragment is returned as that fragment; any
    other list is rendered as a parenthesised, '|'-separated group.
    """
    if len(regexp) == 1:
        return regexp[0]
    return f"({'|'.join(regexp)})"

def merges(d, d1, depth = 2):
    """Generate candidate patterns together with the strings they cover.

    d maps a substring length to a sorted list of substrings, d1 maps a
    substring to the indices of the strings containing it (the two dicts
    returned by substrings()).

    For every length D in 1..depth and every slice window [x:y] of that
    length, substrings sharing the same slice (the "base") are grouped; the
    parts before and after the window are collected as alternative prefixes
    and suffixes. Yields ((prefixes, base, suffixes), covered) pairs where
    prefixes/suffixes are sorted tuples of the non-empty parts and covered is
    the collection of string indices reached (a list for a single substring, a
    set once several have been combined).
    """
    for D in range(1, depth + 1):
        # A length with no admissible substring is simply skipped; indexing
        # d[D] directly raised KeyError in that case.
        b = d.get(D, [])
        # (x, y) -> {slice text -> indices into b of substrings with that slice}
        results = collections.defaultdict(dict)
        for x in range(D+1):
            for y in range(x+1, D+1):
                for i, a in enumerate(b):
                    if a[x:y] not in results[(x, y)]:
                        results[(x, y)][a[x:y]] = [i]
                    else:
                        results[(x, y)][a[x:y]].append(i)
        
        for (x, y), vals in results.items():
            for base, options in vals.items():
                for i, a in enumerate(options):
                    # State: (substrings still to consider, prefixes, base, suffixes, covered)
                    queue = [(options[i+1:], {*b[a][:x]}, base, {*b[a][y:]}, d1[b[a]])]
                    while queue:
                        r_options, l, base, r, o = queue.pop(0)
                        yield (tuple(sorted(filter(None, l))), base, tuple(sorted(filter(None, r)))), o
                        if r_options:
                            # Branch 1: leave the next substring out.
                            queue.append((r_options[1:], l, base, r, o))
                            n_b = b[r_options[0]]
                            # Branch 2: take it, but only if it covers a new string.
                            if any(j not in o for j in d1[n_b]):
                                queue.append((r_options[1:], {*l, n_b[:x]}, base, {*r, n_b[y:]}, {*o, *d1[n_b]}))

def solutions(l1, l2, depth = 2):
    """Search for a short regex matching every string of l1 and none of l2.

    Candidate fragments come from substrings()/merges()/to_regexp(). For each
    distinct coverage (set of l1 indices) only the shortest fragment is kept,
    and fragments dominated by a shorter fragment with a superset coverage are
    discarded. A breadth-first search then combines fragments (at most 5
    extensions per state, each no longer than the previously added fragment)
    until all strings are covered, keeping the shortest combination.

    Returns (length, regex) where regex is '.*' + combined fragments + '.*'
    and length counts those four extra characters.

    The number of strings to cover is taken from len(l1) rather than assumed
    to be 10. `depth` is accepted but not forwarded: merges() is called with
    its own default. If no covering combination is found, r_score stays None
    and the final addition raises TypeError.
    """
    # Number of strings that must be covered (indices 0 .. total - 1).
    total = len(l1)
    d, d1 = substrings(l1, l2)
    merge = dict(merges(d, d1))
    
    # uncovered count -> {coverage tuple -> list of fragments with that coverage}
    scores = collections.defaultdict(dict)
    for a, b in merge.items():
        T = to_regexp(a)
        if tuple(b) not in scores[total - len(b)]:
            scores[total - len(b)][tuple(b)] = [T]
        else:
            scores[total - len(b)][tuple(b)].append(T)

    # Keep only the shortest fragment per coverage; vals lists all (coverage, fragment) pairs.
    new_scores = {}
    vals = []
    for a, b in scores.items():
        subscores = {}
        for j, k in b.items():
            subscores[j] = min(k, key=len)
            vals.append((j, min(k, key=len)))
        new_scores[a] = subscores
    
    def contains(j, k, nj, nk):
        # True when coverage j is included in nj and fragment nk is strictly shorter than k.
        if nj == nk:
            return False

        return all(J in nj for J in j) and len(k) > len(nk)

    # Drop dominated fragments.
    scores = {a:{j:k for j, k in b.items() if not any(contains(j, k, nj, nk) for nj, nk in vals)} for a, b in new_scores.items()}
    r_score, r_regexp = None, None
    # Search states: (uncovered count, covered indices, fragments chosen), best coverage first.
    queue = [(score, {*vals}, [regexp]) for score, container in sorted(scores.items(), key=lambda x:x[0]) for vals, regexp in container.items()]
    seen = []
    while queue:
        score, vals, regexp = queue.pop(0)
        # Prune states that cannot beat the best complete solution so far.
        if r_score is not None:
            if len(full_regexp(regexp)) >= r_score:
                continue

        if not score:
            # Everything is covered: record the combination if it is the shortest.
            if r_score is None:
                r_score, r_regexp = len(T:=full_regexp(regexp)), T

            elif len(T:=full_regexp(regexp)) < r_score:
                r_score, r_regexp = len(T), T
            
            continue

        remainder_vals = {*range(total)} - vals
        full_options = []
        for score, options in scores.items():
            for _vals, r_o in options.items():
                # An extension must cover something new, be no longer than the last
                # fragment added, and not recreate a combination already queued.
                if any(i in remainder_vals for i in _vals) and len(r_o) <= len(regexp[-1]) and tuple(P:=sorted(regexp + [r_o])) not in seen:
                    if r_score is None or len(full_regexp(regexp + [r_o])) < r_score:
                        full_options.append(({*_vals} - {*vals}, _vals, r_o))

        # Expand only the 5 extensions that add the most newly covered strings.
        for _, _vals, r_o in sorted(full_options, key=lambda x:len(x[0]), reverse=True)[:5]:
            seen.append(tuple(P:=sorted(regexp + [r_o])))
            queue.append((len(remainder_vals - {*_vals}), {*vals, *_vals}, regexp + [r_o]))

    return r_score + 4, '.*'+r_regexp+'.*' 
    
if __name__ == '__main__':
    import time

    # Run every block of the battery, printing each regex, then the elapsed
    # time and the total length, block count and average length.
    started = time.time()
    total_length = 0
    block_count = 0
    for to_match, to_reject in test_data():
        length, pattern = solutions(to_match, to_reject)
        total_length += length
        block_count += 1
        print(pattern)

    print(time.time() - started)
    print(total_length, block_count, total_length/block_count)

Rather long solution, but more optimized for speed and regexp length minimization.

Stack Exchange Network

Generate Matching Regexes

Scoring

Rules

2 Answers 2

JavaScript (ES7), Score: 767 / 50 = 15.34

Full output

Python3, Score: 762/50 = 15.24 in ~30.51 seconds

Your Answer

Linked

Hot Network Questions

Generate Matching Regexes

Scoring

Rules

2 Answers 2

JavaScript (ES7), Score: 767 / 50 = 15.34

Full output

Python3, Score: 762/50 = 15.24 in ~30.51 seconds

Your Answer

Sign up or log in

Post as a guest

Linked

Related

Hot Network Questions