5589 |
04 Sep 19 |
nicklas |
#!/usr/bin/perl |
5589 |
04 Sep 19 |
nicklas |
2 |
use strict; |
5589 |
04 Sep 19 |
nicklas |
3 |
use warnings; |
5589 |
04 Sep 19 |
nicklas |
4 |
use Encode; |
5589 |
04 Sep 19 |
nicklas |
5 |
|
5589 |
04 Sep 19 |
nicklas |
# $Id$ |
5589 |
04 Sep 19 |
nicklas |
# Script provided by Anders Kvist |
5589 |
04 Sep 19 |
nicklas |
8 |
|
5589 |
04 Sep 19 |
nicklas |
9 |
die "Usage $0 <file1> <file2> (<file3> ...)" unless @ARGV >= 2; |
5589 |
04 Sep 19 |
nicklas |
10 |
|
5589 |
04 Sep 19 |
nicklas |
11 |
my @fh = (); |
5589 |
04 Sep 19 |
nicklas |
12 |
for my $i (0 .. $#ARGV) { |
5589 |
04 Sep 19 |
nicklas |
13 |
if ($ARGV[$i] =~ /\.gz$/) { |
5589 |
04 Sep 19 |
nicklas |
14 |
open $fh[$i], "gunzip -c $ARGV[$i] |" or die "Cannot open gunzip pipe for '$ARGV[$i]': $!"; |
5589 |
04 Sep 19 |
nicklas |
15 |
} |
5589 |
04 Sep 19 |
nicklas |
16 |
else { |
5589 |
04 Sep 19 |
nicklas |
17 |
open $fh[$i], '<', $ARGV[$i] or die "Cannot open '$ARGV[$i]' for reading: $!"; |
5589 |
04 Sep 19 |
nicklas |
18 |
} |
5589 |
04 Sep 19 |
nicklas |
19 |
} |
5589 |
04 Sep 19 |
nicklas |
20 |
|
5589 |
04 Sep 19 |
nicklas |
21 |
my $count = 0; |
5589 |
04 Sep 19 |
nicklas |
22 |
while (1) { |
5589 |
04 Sep 19 |
nicklas |
23 |
my @name = (); |
5589 |
04 Sep 19 |
nicklas |
24 |
my @seq = (); |
5589 |
04 Sep 19 |
nicklas |
25 |
my @qual = (); |
5589 |
04 Sep 19 |
nicklas |
26 |
my $count_eof = 0; |
5589 |
04 Sep 19 |
nicklas |
27 |
for my $i (0 .. $#fh) |
5589 |
04 Sep 19 |
nicklas |
28 |
{ |
5589 |
04 Sep 19 |
nicklas |
29 |
while (my $line = readline($fh[$i])) |
5589 |
04 Sep 19 |
nicklas |
30 |
{ |
5589 |
04 Sep 19 |
nicklas |
31 |
chomp $line; |
5589 |
04 Sep 19 |
nicklas |
32 |
if (substr($line, 0, 1) eq '@') |
5589 |
04 Sep 19 |
nicklas |
33 |
{ |
5589 |
04 Sep 19 |
nicklas |
34 |
$name[$i] = $line =~ /^.(\S+)/ ? $1 : ''; |
5589 |
04 Sep 19 |
nicklas |
35 |
$name[$i] =~ s/\/[12]$//; # Remove trailing /1 /2 |
5589 |
04 Sep 19 |
nicklas |
36 |
last; |
5589 |
04 Sep 19 |
nicklas |
37 |
} |
5589 |
04 Sep 19 |
nicklas |
38 |
} |
5589 |
04 Sep 19 |
nicklas |
39 |
if (! defined $name[$i]) |
5589 |
04 Sep 19 |
nicklas |
40 |
{ |
5589 |
04 Sep 19 |
nicklas |
41 |
$count_eof++; |
5589 |
04 Sep 19 |
nicklas |
42 |
next; |
5589 |
04 Sep 19 |
nicklas |
43 |
} |
5589 |
04 Sep 19 |
nicklas |
44 |
$seq[$i] = ''; |
5589 |
04 Sep 19 |
nicklas |
45 |
while (my $line = readline($fh[$i])) |
5589 |
04 Sep 19 |
nicklas |
46 |
{ |
5589 |
04 Sep 19 |
nicklas |
47 |
chomp $line; |
5589 |
04 Sep 19 |
nicklas |
48 |
last if substr($line, 0, 1) eq '+'; |
5589 |
04 Sep 19 |
nicklas |
49 |
$seq[$i] .= $line; |
5589 |
04 Sep 19 |
nicklas |
50 |
} |
5589 |
04 Sep 19 |
nicklas |
51 |
$qual[$i] = ''; |
5589 |
04 Sep 19 |
nicklas |
52 |
while (my $line = readline($fh[$i])) |
5589 |
04 Sep 19 |
nicklas |
53 |
{ |
5589 |
04 Sep 19 |
nicklas |
54 |
chomp $line; |
5589 |
04 Sep 19 |
nicklas |
55 |
$qual[$i] .= $line; |
5589 |
04 Sep 19 |
nicklas |
56 |
last if length($qual[$i]) >= length($seq[$i]); |
5589 |
04 Sep 19 |
nicklas |
57 |
} |
5589 |
04 Sep 19 |
nicklas |
58 |
} |
5589 |
04 Sep 19 |
nicklas |
59 |
|
5589 |
04 Sep 19 |
nicklas |
60 |
if ($count_eof && $count_eof != @fh) |
5589 |
04 Sep 19 |
nicklas |
61 |
{ |
5589 |
04 Sep 19 |
nicklas |
62 |
die "Unexpected end of file: fastq files have different number of entries"; |
5589 |
04 Sep 19 |
nicklas |
63 |
} |
5589 |
04 Sep 19 |
nicklas |
64 |
last if $count_eof; |
5589 |
04 Sep 19 |
nicklas |
65 |
$count++; |
5589 |
04 Sep 19 |
nicklas |
66 |
|
5589 |
04 Sep 19 |
nicklas |
# Indentical names? |
5589 |
04 Sep 19 |
nicklas |
68 |
for my $i (1 .. $#fh) { |
5589 |
04 Sep 19 |
nicklas |
69 |
if ($name[$i] ne $name[0]) { |
5589 |
04 Sep 19 |
nicklas |
70 |
die "Name mismatch at record $count (line " . ($count * 4) . ")\n" |
5589 |
04 Sep 19 |
nicklas |
71 |
. "Names: " . join('|', @name) . "\n" |
5589 |
04 Sep 19 |
nicklas |
72 |
. "Seqs: " . join('|', @seq) . "\n" |
5589 |
04 Sep 19 |
nicklas |
73 |
. "Quals: " . join(' ', @qual) . "\n"; |
5589 |
04 Sep 19 |
nicklas |
74 |
} |
5589 |
04 Sep 19 |
nicklas |
75 |
} |
5589 |
04 Sep 19 |
nicklas |
76 |
|
5589 |
04 Sep 19 |
nicklas |
# Identical seq and qual lengths? |
5589 |
04 Sep 19 |
nicklas |
78 |
for my $i (0 .. $#fh) { |
5589 |
04 Sep 19 |
nicklas |
79 |
if (length(Encode::decode("utf-8", $seq[$i])) != length(Encode::decode("utf-8", $qual[$i]))) { |
5589 |
04 Sep 19 |
nicklas |
80 |
die "Length of seq and qual not equal for file " . ($i + 1) . " at record $count (line " . ($count * 4) . ")" |
5589 |
04 Sep 19 |
nicklas |
81 |
. "Names: " . join('|', @name) . "\n" |
5589 |
04 Sep 19 |
nicklas |
82 |
. "Seqs: " . join('|', @seq) . "\n" |
5589 |
04 Sep 19 |
nicklas |
83 |
. "Quals: " . join(' ', @qual) . "\n"; |
5589 |
04 Sep 19 |
nicklas |
84 |
} |
5589 |
04 Sep 19 |
nicklas |
85 |
} |
5589 |
04 Sep 19 |
nicklas |
86 |
|
5589 |
04 Sep 19 |
nicklas |
87 |
print "\@$name[0]\n" . join('-', @seq) . "\n+\n" . join(' ', @qual) . "\n"; |
5589 |
04 Sep 19 |
nicklas |
88 |
} |