use FileHandle; if( @ARGV < 3 ) { print "usage: perl blastz.to.fasta.pl fasta_file1 fasta_file2 blastz_file \n"; exit( 1 ); } elsif( @ARGV == 3 ) { $fasta1 = $ARGV[0]; $fasta2 = $ARGV[1]; $blastzFile = $ARGV[2]; $minPctId = 0; } else { $fasta1 = $ARGV[0]; $fasta2 = $ARGV[1]; $blastzFile = $ARGV[2]; $minPctId = $ARGV[3];; } foreach $fastaFile ($fasta1, $fasta2) { $fh = new FileHandle; $fh->open( $fastaFile ) or die "Can't open $fastaFile: $!"; while( $line = <$fh> ) { $line = CleanLine( $line ); if( $line =~ /^>/ ) { @data = split( /\s+/, $line ); $id = substr( $data[0], 1 ); $seqLine{$id} = ""; } else { $seqLine{$id} .= $line; } } $fh->close(); } $fh = new FileHandle; $fh->open( $blastzFile ) or die "Can't open $blastzFile: $!"; while( $line = <$fh> ) { chomp( $line ); if( $line =~ /^h {/ ) { $line = <$fh>; $line = CleanLine( $line ); $line =~ s/\"//g; $hdr1 = $line; @data = split( /\s+/, $hdr1 ); $id1 = substr( $data[0], 1 ); $line = <$fh>; $line = CleanLine( $line ); $line =~ s/\"//g; $hdr2 = $line; @data = split( /\s+/, $hdr2 ); $id2 = substr( $data[0], 1 ); last if( $id2 =~ /\(reverse\scomplement\)$/ ); # don't deal with reverse alignments } elsif( $line =~ /^a {/ ) { $line = <$fh>; $line = CleanLine( $line ); @data = split( /\s+/, $line ); $score = $data[1]; while( $line = <$fh> ) { last if( $line =~ /^}/ ); $line = CleanLine( $line ); if( $line =~ /^l/ ) { @data = split( /\s+/, $line ); $p = {}; $p->{start1} = $data[1] - 1; $p->{end1} = $data[3] - 1; $p->{len1} = $p->{end1} - $p->{start1} + 1; $p->{start2} = $data[2] - 1; $p->{end2} = $data[4] - 1; $p->{len2} = $p->{end2} - $p->{start2} + 1; $p->{pct} = $data[5]; $p->{hdr1} = $hdr1; $p->{hdr2} = $hdr2; $p->{score} = $score; $p->{use} = 1; die "Error: lengths don't match '$line'" if( $p->{len1} != $p->{len2} ); push @alignList, $p; } } } } $fh->close(); @alignList = sort ByStart1 @alignList; $prev = ""; $i = 0; while( $i < @alignList) { $overlap = 1; while( $overlap == 1 ) { $overlap = 0; $p = $alignList[$i]; if( $prev ) { if( $prev->{end1} >= $p->{start1} ) { $overlap = 1; print STDERR "$p->{hdr1}\n"; print STDERR "$p->{hdr2}\n"; print STDERR "seq 1: ($prev->{start1}, $prev->{end1}) overlaps ($p->{start1}, $p->{end1})\n"; } elsif( $prev->{end2} >= $p->{start2} ) { $overlap = 1; print STDERR "$p->{hdr1}\n"; print STDERR "$p->{hdr2}\n"; print STDERR "seq 2: ($prev->{start2}, $prev->{end2}) overlaps ($p->{start2}, $p->{end2})\n"; } if( $overlap ) { $p->{use} = 0; print STDERR "eliminating ($p->{start1}, $p->{end1}) ($p->{start2}, $p->{end2})\n\n"; $i++; } } } $prev = $p; $i++; } foreach $p (@alignList) { push @alignList2, $p if( $p->{use} ); } $alignSeq1 = ""; $alignSeq2 = ""; $prev = ""; foreach $p (@alignList2) { $hdr1 = $p->{hdr1}; $hdr2 = $p->{hdr2}; if( $prev ) { for( $i = $prev->{end1} + 1; $i <= $p->{start1} - 1; $i++ ) { $alignSeq1 .= substr( $seqLine{$id1}, $i, 1 ); $alignSeq2 .= "N"; } for( $i = $prev->{end2} + 1; $i <= $p->{start2} - 1; $i++ ) { $alignSeq1 .= "N"; $alignSeq2 .= substr( $seqLine{$id2}, $i, 1 ); } } if( $p->{pct} >= $minPctId ) { $alignSeq1 .= substr( $seqLine{$id1}, $p->{start1}, $p->{len1} ); $alignSeq2 .= substr( $seqLine{$id2}, $p->{start2}, $p->{len2} ); } else { for( $i = 0; $i < $p->{len1}; $i++ ) { $alignSeq1 .= "N"; $alignSeq2 .= "N"; } } $prev = $p; } $len1 = length( $alignSeq1 ); $len2 = length( $alignSeq2 ); die "Sequence lengths are not equal" if( $len1 != $len2); PrintSeq( $hdr1 . " aligned len: $len1", $alignSeq1 ); PrintSeq( $hdr2 . " aligned len: $len2", $alignSeq2 ); sub ByStart1 { $a->{start1} <=> $b->{start1} or $a->{score} <=> $b->{score}; } sub PrintSeq { my $hdr = shift @_; my $seq = shift @_; print "$hdr\n"; for( my $i = 0; $i < length( $seq ); $i += 60 ) { print substr( $seq, $i, 60 ), "\n"; } print "\n"; } sub CleanLine { my $line = shift @_; chomp( $line ); $line =~ s/\r//g; return trim( $line ); } sub IsOnList { my( $target, @list ) = @_; if( ! @list ) { return 0; } foreach my $s (@list) { return 1 if( $s eq $target); } return 0; } sub trim { my @out = @_; for (@out) { s/^\s+//; s/\s+$//; } return wantarray ? @out : $out[0]; }