Match DNA 8-mers and substitute nucleotides for IUB code

Author: Daniel Carrera

Based on the submission for Perl 5.

http://benchmarksgame.alioth.debian.org/u32/performance.php?test=regexdna

USAGE: perl6 regex-dna.p6 regex-dna.input

Expected output

agggtaaa|tttaccct 0
[cgt]gggtaaa|tttaccc[acg] 3
a[act]ggtaaa|tttacc[agt]t 9
ag[act]gtaaa|tttac[agt]ct 8
agg[act]taaa|ttta[agt]cct 10
aggg[acg]aaa|ttt[cgt]ccct 3
agggt[cgt]aa|tt[acg]accct 4
agggta[cgt]a|t[acg]taccct 3
agggtaa[cgt]|[acg]ttaccct 5

101745
100000
133640

Source code: regex-dna.p6

use v6;

sub MAIN($input-file = $*SPEC.catdir($*PROGRAM-NAME.IO.dirname, "regex-dna.input")) {
    my $input = $input-file.IO.slurp;
    my $data = $input.lines.grep({ $_ !~~ /^ \>/}).join.lc;

    say $_ ~ ' ' ~ +$data.comb($_) for
        /agggtaaa|tttaccct/             but 'agggtaaa|tttaccct',
        /<[cgt]>gggtaaa|tttaccc<[acg]>/ but '[cgt]gggtaaa|tttaccc[acg]',
        /a<[act]>ggtaaa|tttacc<[agt]>t/ but 'a[act]ggtaaa|tttacc[agt]t',
        /ag<[act]>gtaaa|tttac<[agt]>ct/ but 'ag[act]gtaaa|tttac[agt]ct',
        /agg<[act]>taaa|ttta<[agt]>cct/ but 'agg[act]taaa|ttta[agt]cct',
        /aggg<[acg]>aaa|ttt<[cgt]>ccct/ but 'aggg[acg]aaa|ttt[cgt]ccct',
        /agggt<[cgt]>aa|tt<[acg]>accct/ but 'agggt[cgt]aa|tt[acg]accct',
        /agggta<[cgt]>a|t<[acg]>taccct/ but 'agggta[cgt]a|t[acg]taccct',
        /agggtaa<[cgt]>|<[acg]>ttaccct/ but 'agggtaa[cgt]|[acg]ttaccct';

    say();

    my %iub = 'b' => '(c|g|t)', 'd' => '(a|g|t)', 'h' => '(a|c|t)',
              'k' => '(g|t)',   'm' => '(a|c)',   'n' => '(a|c|g|t)',
              'r' => '(a|g)',   's' => '(c|g)',   'v' => '(a|c|g)',
              'w' => '(a|t)',   'y' => '(c|t)';

    my $output = $data.subst(/(<[bdhkmnrsvwy]>)/, { %iub{$_} }, :g);

    .chars.say for $input, $data, $output;
}