#! /usr/local/bin/perl # Copyright (C)) 2005 Applied Biosystems. # This file may be copied and redistributed freely, without advance permission, # provided that this Copyright statement is reproduced with each copy. # LIMITATION OF WARRANTY # NOTHING IN THIS AGREEMENT WILL BE CONSTRUED AS A REPRESENTATION MADE OR # WARRANTY GIVEN BY APPLIED BIOSYSTEMS OR ANY THIRD PARTY THAT THE USE OF # DATA PROVIDED HEREUNDER WILL NOT INFRINGE ANY PATENT, COPYRIGHT, TRADEMARK # OR OTHER RIGHTS OF ANY THIRD PARTY. DATA IS PROVIDED "AS IS" WITHOUT # WARRANTY OF ANY KIND WHATSOEVER, EXPRESS OR IMPLIED, INCLUDING IMPLIED # WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. APPLIED # BIOSYSTEMS MAKES NO WARRANTY THAT ITS DATA DOES NOT CONTAIN ERRORS. package Hit; use Hsp; 1; # return value for lib loader sub new { my $class = shift; my @hitText = @_; my $self = {}; bless $self, $class; $self->init(@hitText); return $self; } sub init { my $self = shift; $self->{parent} = shift; my @hitText = @_; # attributes for hit text & hsp retrieval $self->{hitText} = \@hitText; $self->{numLines} = $#hitText; $self->{curLine} = 0; $self->{noMoreHits} = undef; # parsing setup $self->{qStart} = 999999; $self->{hStart} = 999999; $self->{qEnd} = -1; $self->{hEnd} = -1; $self->{hitRanges} = (); $self->{queryRanges} = (); # find all appropriate values foreach $line (@hitText) { if ($line =~ /^>([\S]+)/) { $self->{name} = $1 unless ($self->{name}); chomp $line; $self->{desc} = $line; } if (($line =~ /Length = ([\d]+)/)) { $self->{length} = $1 unless ($self->{length}); } if (($line =~ /Expect(\(\d+\))? = (.*)/)) { $self->{expect} = $2 unless ($self->{expect}); } if (($line =~ /Score =[\s]+([\d\.]+) bits \(([\d]+)\),/)) { $self->{bits} = $1 unless ($self->{bits}); $self->{score} = $2 unless ($self->{score}); } if(($line =~ m!Identities = ([\d]+)/([\d]+)!)){ $self->{hitId} += $1 unless ($self->{hitId}); $self->{length_aln} += $2 unless ($self->{length_aln}); } if($line =~ m!Positives = ([\d]+)!){ $self->{hitCons} += $1 unless ($self->{hitCons}); } if ($line =~ m!^Query: ([\d]+) .* ([\d]+)!) { $self->{qStart} = ($1<$self->{qStart})?$1:$self->{qStart}; $self->{qEnd} = ($2>$self->{qEnd})?$2:$self->{qEnd}; } if ($line =~ m!^Sbjct: ([\d]+) .* ([\d]+)!) { $self->{hStart} = ($1<$self->{hStart})?$1:$self->{hStart}; $self->{hEnd} = ($2>$self->{hEnd})?$2:$self->{hEnd}; } } # once we've processed the macro values, then do tiling $self->calcTiledRanges(); } sub name { my $self = shift; return $self->{name}; } #### # sub id is a MAG-specific function. It returns the requested ID portion # of a NCBI-style complex name, or the name of the sequence if not in NCBI # format # # params: $type = {gi|spid|spacc}. If undef, returns gi # return: the appropriate id of the sequence, or the bare name #### sub id { my $self = shift; my $type = lc(shift); return $self->{name} unless ($self->{name} =~ /\|/); # ret name if not NCBI if (! $type || $type eq "gi") { return $1 if ($self->{name} =~ /gi\|([\d]+)/); } # on failure, just return the name return $self->{name}; } sub desc { my $self = shift; return $self->{desc}; } sub score { my $self = shift; return $self->{score}; } sub bits { my $self = shift; return $self->{bits}; } sub expect { my $self = shift; return $self->{expect}; } sub length { my $self = shift; return $self->{length}; } sub length_aln { my $self = shift; return $self->{length_aln}; } sub tot_query_aln { my $self = shift; return $self->{tot_query_aln}; } sub tot_hit_aln { my $self = shift; return $self->{tot_hit_aln}; } sub matches { my $self = shift; my $type = shift; if (lc($type) eq 'id') { return $self->{hitId}; } elsif (lc($type) eq 'cons') { return $self->{hitCons}; } else { return undef; } } sub frac_aligned_query { my $self = shift; my $type = shift; if (lc($type) eq "tot") { return ($self->{tot_query_aln} / $self->{parent}->length_query()); } else { return ($self->{length_aln} / $self->{parent}->length_query()); } } sub frac_aligned_hit { my $self = shift; my $type = shift; if (lc($type) eq "tot") { return ($self->{tot_hit_aln} / $self->{length}); } else { return ($self->{length_aln} / $self->{length}); } } sub frac_identical { my $self = shift; my $type = shift; return ($self->{hitId} / $self->{length_aln}); } sub start { my $self = shift; my $type = shift; if (lc($type) eq 'query') { return $self->{qStart}; } elsif (lc($type) eq 'sbjct') { return $self->{hStart}; } else { return undef; } } sub end { my $self = shift; my $type = shift; if (lc($type) eq 'query') { return $self->{qEnd}; } elsif (lc($type) eq 'sbjct') { return $self->{hEnd}; } else { return undef; } } sub range { my $self = shift; my $type = shift; if (lc($type) eq 'query') { return ($self->{qStart}, $self->{qEnd}); } elsif (lc($type) eq 'sbjct') { return ($self->{hStart}, $self->{hEnd}); } else { return undef; } } sub seekNextHsp { my $self = shift; my $start = shift; $start = $self->{curLine} unless ($start); my $i; my $line; my $isLast = undef; # loop from our start position to end-of-array # bail if we find the next HSP ("Score") for ($i=$start+1 ; $i < $self->{numLines} ; $i++) { $line = $self->{hitText}[$i]; last if ($line =~ /Score /); } # if we didn't go past the end-of-array, return the HSP pos # otherwise return last line pos & set the last flag if ($i < $self->{numLines}) { return ($i, 0); } else { return ($i, 1); } } sub nextHsp { my $self = shift; my $start; my $end; my $isLast; # bail if we're done if ($self->{noMoreHsps}){ #reset $self->{noMoreHsp} which is used to control end of loop $self->{noMoreHsps} = 0; return undef; } # find the start & end of this HSP ($start, $isLast) = $self->seekNextHsp(); ($end, $isLast) = $self->seekNextHsp($start); # return the HSP if we've got it if ($start && $end) { $end--; @hsp = @{$self->{hitText}}[$start..$end]; $self->{curLine} = $end; $self->{noMoreHsps} = 1 if ($isLast); return new Hsp(@hsp); } else { return undef; } } # loop over all HSPs to make the correct tiling sub calcTiledRanges { my $self = shift; my $oldCurLine = $self->{curLine}; while (my $hsp = $self->nextHsp()) { my @qrange = $hsp->range('query'); my @hrange = $hsp->range('sbjct'); &mergeRanges(\$self->{queryRanges}, @qrange); &mergeRanges(\$self->{hitRanges}, @hrange); } # sort & remove deletes $self->{queryRanges} = &cleanRanges($self->{queryRanges}); $self->{hitRanges} = &cleanRanges($self->{hitRanges}); # calc total lengths $self->{tot_query_aln} = &sumRanges($self->{queryRanges}); $self->{tot_hit_aln} = &sumRanges($self->{hitRanges}); $self->{curLine} = $oldCurLine; # rewind } sub mergeRanges { my $oldRanges = shift; my @newRange = @_; my $newStart = $newRange[0]; my $newEnd = $newRange[1]; # if this is first one, push on stack unless (@{$$oldRanges}) { push (@{$$oldRanges}, \@newRange); return; } # otherwise review all hit ranges # basic alg is: if overlap, merge (ie, extend to new range) # otherwise push new range on stack # if overlap with multiple, merge the old ranges my $startMerge; my $endMerge; my $skipFlag = 0; foreach $range (@{$$oldRanges}) { my $oldStart = $$range[0]; my $oldEnd = $$range[1]; next if (!$oldStart && !$oldEnd); # skip if "deleted" if (&between($newStart, $oldStart, $oldEnd)) { if (&between($newEnd, $oldStart, $oldEnd)) { $skipFlag = 1; last; # skip, we are enclosed in another HSP } else { $$range[1] = $newEnd; # extend towards new end $endMerge = $range; } } else { if (&between($newEnd, $oldStart, $oldEnd)) { $$range[0] = $newStart; # extend towards new start $startMerge = $range; } else { # check we don't enclose this HSP ourselves! # if we do, then replace it with ourselves if ((&between($oldStart, $newStart, $newEnd)) && (&between($oldEnd, $newStart, $newEnd))) { $$range[0] = $newStart; $$range[1] = $newEnd; $skipFlag = 1; last; # skip, we enclosed this HSP } else { # no overlap == no action just yet, will push later } } } } # if we merged on both sides, merge the merges # "delete" (set to zero) one copy if ($startMerge && $endMerge) { $$startMerge[0] = $$endMerge[0]; $$endMerge[0] = 0; $$endMerge[1] = 0; } # otherwise if we got no overlaps push our range on the stack if (!$skipFlag && !$startMerge && !$endMerge) { push (@{$$oldRanges}, \@newRange); return; } } sub cleanRanges { my $ranges = shift; my $clean; foreach my $range (@{$ranges}) { next if (! $$range[0] && ! $$range[1]); push(@{$clean}, $range); } @{$clean} = sort {$$a[0] <=> $$b[0]} @{$clean}; return $clean; } sub sumRanges { my $ranges = shift; my $len = 0; foreach my $range (@{$ranges}) { $len += ($$range[1] - $$range[0] + 1); } return $len; } # true if first number is between second & third numbers # endpoints are included sub between { my $x = shift; my $low = shift; my $high = shift; return 1 if (($x >= $low) && ($x <= $high)); return 0; }