#!/usr/local/bin/perl -w
# Daniel Brami
# Util to split interlaced FASTQ files into pairs
use strict;
# Standard lib
use IO::File;
use File::Basename;
my $INPUT=shift;
if (!(defined ($INPUT)) || ($INPUT =~ '^\-')){
die "Usage: $0 <interleaved paired FASTQ file>\n";
}
my ($name,$path,$suffix) = fileparse($INPUT, qw/fastq FASTQ txt TXT/);
my $FH_IN = new IO::File($INPUT, "r") or die "could not open $INPUT: $!\n";
my $FH_OUT1 = new IO::File($name."split1.$suffix", "w") or die "could not open $name.split1$suffix for writing: $!\n";
my $FH_OUT2 = new IO::File($name."split2.$suffix", "w") or die "could not open $name.split2$suffix for writing: $!\n";
my ($recs1, $recs2)= (0,0);
my $flipflop = 1;
my $counter = 0;
my ($line, $TXT);
while($line = $FH_IN->getline()){
$TXT .= $line;
$counter++;
if($counter == 4){
if($flipflop == 1){
print $FH_OUT1 $TXT;
++$recs1;
}else{
print $FH_OUT2 $TXT;
++$recs2;
}
$counter = 0;
$TXT = '';
$flipflop *= -1;
}
}
$FH_IN->close();
$FH_OUT1->close();
$FH_OUT2->close();
print STDERR "Processed $recs1 records for pair file 1 and $recs2 records for pair file 2.\n";
if($recs1 != $recs2){
print STDERR "The number of processed records does not match - check input data!";
}exit;
Wednesday, March 23, 2011
Split fastq mates
Here is code in perl to split a single paired interleaved FASTQ file into a pair of FASTQ files:
Subscribe to:
Post Comments (Atom)
My fastQ file looks like this:
ReplyDelete@GZTMLPY01DG8AY/1
ATTCAATTTATTGTTACTCAAATCGTCTGCATTGAAA
+GZTMLPY01DG8AY/1
EE<@;;999:33@88@;A@244=AIIHHIEE;;;666
@GZTMLPY01DG8AY/2
AGGTGCCGTTGCGTCAGCGAATGACTTTAAGATCCCTCGAACATGCATGCAATGTTCCAGCG
+GZTMLPY01DG8AY/2
IIIIIIIIIIIIGECEIIIIGGGIGGGHIIIIIFDDID????FIGGGGGIIC?431DDEEA@
After splitting the first 2 lines of the new fastq looks like:
paired.split1.fastq:
@GZTMLPY01DG8AY/1
ATTCAATTTATTGTTACTCAAATCGTCTGCATTGAAA
paired.split2.fastq
EE<@;;999:33@88@;A@244=AIIHHIEE;;;666
@GZTMLPY01DG8AY/2
file2 is wrong.
Yeah, he has the $counter set to 1 initially but it should be 0. Makes you wonder if the program was ever tested.
ReplyDeleteThere are also some weird syntax. For example setting $flipflop to -1 via a multiplication ... that is strange. But there is more than one way to do things so I won't complain too much.
Also what happens if you do not have perfectly interleaved sequences? There is no double check to check this.
Henri and Rick - you are right, both $counter variables should have been set to 0; it is now corrected;
ReplyDeleteHenri: if you really feel I need to put a verification step, send me the code and I'll paste it in.