Source code for dgenies.lib.paf

#!/usr/bin/env python3

from dgenies import MODE

import os
import shutil
from math import sqrt
from numpy import mean
from pathlib import Path
import json
from dgenies.bin.index import Index
from dgenies.config_reader import AppConfigReader
from dgenies.lib.functions import Functions
from intervaltree import IntervalTree
import matplotlib as mpl
mpl.use('Agg')
from matplotlib import pyplot as plt
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


[docs]class Paf:
    """
    Functions applied to PAF files
    """

    config = AppConfigReader()
    limit_idy = [0.25, 0.5, 0.75]
    max_nb_lines = config.max_nb_lines

    def __init__(self, paf: str, idx_q: str, idx_t: str, auto_parse: bool = True, mailer=None, id_job=None):
        """
        :param paf: PAF file path
        :type paf: str
        :param idx_q: query index file path
        :type idx_q: str
        :param idx_t: target index file path
        :type idx_t: str
        :param auto_parse: if True, parse PAF file at initialisation
        :type auto_parse: bool
        :param mailer: mailer object, to send mails
        :type mailer: Mailer
        :param id_job: job id
        :type id_job: str
        """
        self.paf = paf
        self.idx_q = idx_q
        self.idx_t = idx_t
        self.sorted = False
        self.data_dir = os.path.dirname(paf)
        if os.path.exists(os.path.join(self.data_dir, ".sorted")):
            self.paf += ".sorted"
            self.idx_q += ".sorted"
            self.sorted = True
        self.sampled = False
        self.len_q = None
        self.len_t = None
        self.min_idy = None
        self.max_idy = None
        self.lines = {}
        self.q_contigs = {}
        self.q_order = []
        self.t_contigs = {}
        self.t_order = []
        self.q_reversed = {}
        self.name_q = None
        self.name_t = None
        self.parsed = False
        self.error = False
        self.mailer = mailer
        self.id_job = id_job
        self.q_abs_start = {}
        self.t_abs_start = {}

        if auto_parse:
            self.parse_paf()

    @staticmethod
    def _flush_blocks(index_c, new_index_c, new_index_o, current_block):
        """
        When parsing the index, merge a run of too-small sequential contigs into a single mixed block
        (if the run contains at least 5 contigs), else just add each contig to the new index

        :param index_c: current index contigs def
        :type index_c: dict
        :param new_index_o: new index contigs order
        :type new_index_o: list
        :param new_index_c: new index contigs def
        :type new_index_c: dict
        :param current_block: contigs in the current analyzed block
        :type current_block: list
        :return: (new index contigs defs, new index contigs order)
        :rtype: (dict, list)
        """
        if len(current_block) >= 5:
            block_length = 0
            for contig in current_block:
                block_length += index_c[contig]
            b_name = "###MIX###_" + "###".join(current_block)
            new_index_c[b_name] = block_length
            new_index_o.append(b_name)
        elif len(current_block) > 0:
            for b_name in current_block:
                new_index_c[b_name] = index_c[b_name]
                new_index_o.append(b_name)
        return new_index_c, new_index_o
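A minimal usage sketch, assuming a configured D-Genies installation and hypothetical job files (a minimap2 PAF file plus the two index files built by dgenies.bin.index):

# Minimal usage sketch (hypothetical paths): load an alignment and check it parsed.
from dgenies.lib.paf import Paf

paf = Paf(paf="/path/to/job/map.paf",       # hypothetical PAF path
          idx_q="/path/to/job/query.idx",   # hypothetical query index
          idx_t="/path/to/job/target.idx")  # hypothetical target index
if paf.parsed:
    print(paf.name_q, "vs", paf.name_t, ":", sum(len(v) for v in paf.lines.values()), "matches")
else:
    print("Parse failed:", paf.error)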
[docs]    def parse_index(self, index_o: list, index_c: dict, full_len: int):
        """
        Parse index and merge too-small contigs together

        :param index_o: index contigs order
        :type index_o: list
        :param index_c: index contigs def
        :type index_c: dict
        :param full_len: length of the sequence
        :type full_len: int
        :return: (new contigs def, new contigs order)
        :rtype: (dict, list)
        """
        new_index_o = []
        new_index_c = {}
        current_block = []
        for index in index_o:
            if index_c[index] >= 0.002 * full_len:
                new_index_c, new_index_o = self._flush_blocks(index_c, new_index_c, new_index_o, current_block)
                current_block = []
                new_index_c[index] = index_c[index]
                new_index_o.append(index)
            else:
                current_block.append(index)
        new_index_c, new_index_o = self._flush_blocks(index_c, new_index_c, new_index_o, current_block)
        return new_index_c, new_index_o
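A rough sketch of the merge rule with made-up contig sizes: contigs shorter than 0.2 % of the total length are grouped, and a run of five or more such contigs becomes a single "###MIX###_" block.

# Toy illustration of parse_index / _flush_blocks (all values are made up).
p = Paf("res.paf", "q.idx", "t.idx", auto_parse=False)  # no parsing, instance only needed to call the method
order = ["chr1"] + ["scaf%d" % i for i in range(5)]
contigs = {"chr1": 1000000, **{"scaf%d" % i: 500 for i in range(5)}}
new_contigs, new_order = p.parse_index(order, contigs, full_len=1002500)
# chr1 is kept as-is; the five 500 bp scaffolds are merged into one "###MIX###_..." block
print(new_order)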
[docs]    @staticmethod
    def remove_noise(lines, noise_limit):
        """
        Remove noise from the dot plot

        :param lines: lines of the dot plot, by class
        :type lines: dict
        :param noise_limit: line length limit
        :type noise_limit: float
        :return: kept lines, by class
        :rtype: dict
        """
        keep_lines = {
            "0": [],
            "1": [],
            "2": [],
            "3": []
        }
        for cls, c_lines in lines.items():
            for line in c_lines:
                x1 = line[0]
                x2 = line[1]
                y1 = line[2]
                y2 = line[3]
                idy = line[4]
                len_m = sqrt(pow(x2 - x1, 2) + pow(y2 - y1, 2))
                if len_m >= noise_limit:
                    keep_lines[cls].append(line)

        return keep_lines
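A small sketch of the filter with toy values: only lines whose Euclidean length reaches the limit are kept.

# Toy illustration of remove_noise (coordinates are made up).
lines = {"0": [], "1": [], "2": [],
         "3": [[0, 30, 0, 40, 0.9, "q1", "t1"],   # length 50, kept
               [0, 3, 0, 4, 0.9, "q1", "t1"]]}    # length 5, dropped with limit 10
kept = Paf.remove_noise(lines, noise_limit=10)
print(len(kept["3"]))  # 1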
[docs]    def keyerror_message(self, exception, type_f):
        """
        Build the error message shown when a contig is not found in query or target

        :param exception: exception object
        :type exception: KeyError
        :param type_f: type of data (query or target)
        :type type_f: str
        :return: error message
        :rtype: str
        """
        message = "Invalid contig for %s: %s" % (type_f, exception.args[0])
        if os.path.exists(os.path.join(self.data_dir, ".align")):
            message += ". Maybe you inverted the query and target files?"
        return message
[docs]    def parse_paf(self, merge_index=True, noise=True):
        """
        Parse PAF file

        :param merge_index: if True, merge too-small contigs in index
        :type merge_index: bool
        :param noise: if False, remove noise (filter out very short matches)
        :type noise: bool
        """
        min_idy = 10000000000
        max_idy = -10000000000
        lines = {
            "0": [],  # idy < 0.25
            "1": [],  # idy < 0.5
            "2": [],  # idy < 0.75
            "3": []   # idy >= 0.75
        }
        try:
            name_q, q_order, q_contigs, q_reversed, q_abs_start, len_q = Index.load(self.idx_q)
            self.q_abs_start = q_abs_start
            if merge_index:
                q_contigs, q_order = self.parse_index(q_order, q_contigs, len_q)
        except IOError:
            self.error = "Index file does not exist for query!"
            return False

        try:
            name_t, t_order, t_contigs, t_reversed, t_abs_start, len_t = Index.load(self.idx_t)
            self.t_abs_start = t_abs_start
            if merge_index:
                t_contigs, t_order = self.parse_index(t_order, t_contigs, len_t)
        except IOError:
            self.error = "Index file does not exist for target!"
            return False

        lines_lens = []

        try:
            with open(self.paf, "r") as paf_file:
                nb_lines = 0
                for line in paf_file:
                    nb_lines += 1
                    if nb_lines > self.max_nb_lines:
                        self.sampled = True
                        break
                    parts = line.strip("\n").split("\t")
                    v1 = parts[0]
                    v6 = parts[5]
                    strand = 1 if parts[4] == "+" else -1
                    idy = int(parts[9]) / int(parts[10])
                    min_idy = min(min_idy, idy)
                    max_idy = max(max_idy, idy)
                    # x1, x2, y1, y2, idy
                    try:
                        y1 = int(parts[2]) + q_abs_start[v1]
                        y2 = int(parts[3]) + q_abs_start[v1]
                    except KeyError as e:
                        self.error = self.keyerror_message(e, "query")
                        return False
                    try:
                        x1 = int(parts[7 if strand == 1 else 8]) + t_abs_start[v6]
                        x2 = int(parts[8 if strand == 1 else 7]) + t_abs_start[v6]
                    except KeyError as e:
                        self.error = self.keyerror_message(e, "target")
                        return False
                    len_m = sqrt(pow(x2 - x1, 2) + pow(y2 - y1, 2))
                    lines_lens.append(len_m)
                    if idy < self.limit_idy[0]:
                        class_idy = "0"
                    elif idy < self.limit_idy[1]:
                        class_idy = "1"
                    elif idy < self.limit_idy[2]:
                        class_idy = "2"
                    else:
                        class_idy = "3"
                    lines[class_idy].append([x1, x2, y1, y2, idy, v1, v6])
        except IOError:
            self.error = "PAF file does not exist!"
            return False

        if not noise and nb_lines > 1000:
            counts, bins, bars = plt.hist(lines_lens, bins=nb_lines // 10)
            counts = list(counts)
            max_value = max(counts)
            max_index = counts.index(max_value)
            limit_index = -1
            for i in range(max_index, len(counts)):
                if counts[i] < max_value / 50:
                    limit_index = i
                    break
            if limit_index > -1:
                lines = self.remove_noise(lines, bins[limit_index])

        self.parsed = True
        self.len_q = len_q
        self.len_t = len_t
        self.min_idy = min_idy
        self.max_idy = max_idy
        self.lines = lines
        self.q_contigs = q_contigs
        self.q_order = q_order
        self.q_reversed = q_reversed
        self.t_contigs = t_contigs
        self.t_order = t_order
        self.name_q = name_q
        self.name_t = name_t
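For reference, a sketch of how a single PAF record becomes one plot line (same arithmetic as above; the record and offsets are made up): identity is column 10 divided by column 11, and coordinates are shifted by the cumulated start of the contig on each axis.

# How one PAF record is turned into a dot-plot line (toy record and offsets).
record = "q1\t5000\t100\t900\t+\tt1\t20000\t1000\t1800\t750\t800\t60"
parts = record.split("\t")
q_abs_start = {"q1": 0}        # made-up cumulated starts
t_abs_start = {"t1": 50000}
strand = 1 if parts[4] == "+" else -1
idy = int(parts[9]) / int(parts[10])   # 750 / 800 = 0.9375 -> class "3"
y1, y2 = int(parts[2]) + q_abs_start[parts[0]], int(parts[3]) + q_abs_start[parts[0]]
x1 = int(parts[7 if strand == 1 else 8]) + t_abs_start[parts[5]]
x2 = int(parts[8 if strand == 1 else 7]) + t_abs_start[parts[5]]
print([x1, x2, y1, y2, idy])           # [51000, 51800, 100, 900, 0.9375]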
[docs]    def get_d3js_data(self):
        """
        Build data for the D3.js client

        :return: data for d3.js:

            * y_len: length of query (Bp)
            * x_len: length of target (Bp)
            * min_idy: minimum of identity (float)
            * max_idy: maximum of identity (float)
            * lines: matches lines, by class of identity (dict)
            * y_contigs: query contigs definitions (dict)
            * y_order: query contigs order (list)
            * x_contigs: target contigs definitions (dict)
            * x_order: target contigs order (list)
            * name_y: name of the query (str)
            * name_x: name of the target (str)
            * limit_idy: limit for each class of identities (list)
            * sorted: whether the query contigs are sorted (bool)
            * sampled: whether the PAF file was sampled (bool)
            * max_nb_lines: maximum number of lines loaded (int)

        :rtype: dict
        """
        return {
            'y_len': self.len_q,
            'x_len': self.len_t,
            'min_idy': self.min_idy,
            'max_idy': self.max_idy,
            'lines': self.lines,
            'y_contigs': self.q_contigs,
            'y_order': self.q_order,
            'x_contigs': self.t_contigs,
            'x_order': self.t_order,
            'name_y': self.name_q,
            'name_x': self.name_t,
            'limit_idy': self.limit_idy,
            'sorted': self.sorted,
            'sampled': self.sampled,
            "max_nb_lines": self.max_nb_lines,
        }
[docs]    def save_json(self, out):
        """
        Save D3.js data to a JSON file

        :param out: output file path
        :type out: str
        """
        data = self.get_d3js_data()
        with open(out, "w") as out_f:
            out_f.write(json.dumps(data))
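A typical call chain, assuming the same hypothetical job files as in the earlier sketch: parse, then dump the D3.js payload.

# Dump the dot-plot data for the web client (hypothetical paths).
paf = Paf("/path/to/job/map.paf", "/path/to/job/query.idx", "/path/to/job/target.idx")
if paf.parsed:
    paf.save_json("/path/to/job/d3js.json")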
[docs]    def is_contig_well_oriented(self, lines, contig, chrom):
        """
        Returns True if the contig is well oriented. A well-oriented contig must have y increasing when x increases.
        We check that only for the longest matches (small matches are ignored)

        :param lines: lines inside the contig
        :type lines: list
        :param contig: query contig name
        :type contig: str
        :param chrom: target chromosome name
        :type chrom: str
        :return: True if well oriented, False else
        :rtype: bool
        """
        lines.sort(key=lambda x: -x[-1])
        max_len = lines[0][-1]
        i = 0

        # Select samples of tested lines such that:
        # - a selected line is at least 10% of the longest line
        # - and a selected line is at least 1% of the length of contig or the length of chrom
        while i < len(lines) and lines[i][-1] > max_len * 0.1 \
                and lines[i][-1] >= 0.01 * min(self.q_contigs[contig], self.t_contigs[chrom]):
            i += 1
        selected_lines = lines[:i]

        # Check orientation of each line:
        if len(selected_lines) > 1:
            selected_lines.sort(key=lambda x: x[0])
            orients = []
            for i in range(1, len(selected_lines)):
                if selected_lines[i][2] > selected_lines[i - 1][2]:
                    orients.append(1)
                else:
                    orients.append(-1)
            if mean(orients) > -0.1:  # We have a good orientation (-0.1 to ignore ambiguous cases)
                return True
        elif len(selected_lines) == 1:
            orient_first = "+" if selected_lines[0][3] < selected_lines[0][4] else "-"  # Get X orientation
            if selected_lines[0][-2 if orient_first == "+" else -3] > \
                    selected_lines[0][-3 if orient_first == "+" else -2]:  # Check Y according to X orientation
                return True
        elif len(selected_lines) == 0:
            # No line was selected: we ignore this block
            return True

        # In all other cases the orientation is wrong:
        return False
[docs]    def reorient_contigs_in_paf(self, contigs):
        """
        Reorient contigs in the PAF file

        :param contigs: contigs to be reoriented
        """
        sorted_file = self.paf + ".sorted"
        contigs = set(contigs)
        with open(self.paf, "r") as source, open(sorted_file, "w") as target:
            for line in source:
                parts = line.strip("\n").split("\t")
                if parts[0] in contigs:
                    len_q = int(parts[1])
                    x1_q = int(parts[2])
                    x2_q = int(parts[3])
                    if parts[4] == "-":
                        parts[4] = "+"
                    else:
                        parts[4] = "-"
                    parts[2] = str(len_q - x2_q)
                    parts[3] = str(len_q - x1_q)
                target.write("\t".join(parts) + "\n")
        if self.paf.endswith(".sorted"):
            os.remove(self.paf)
            shutil.move(sorted_file, self.paf)
        else:
            self.paf = sorted_file
        return True
    def _update_query_index(self, contigs_reoriented):
        """
        Write a new query index file (including new reoriented contigs info)

        :param contigs_reoriented: reoriented contigs list
        :type contigs_reoriented: list
        """
        contigs_reoriented = set(contigs_reoriented)
        with open(self.idx_q, "w") as idx:
            idx.write(self.name_q + "\n")
            for contig in self.q_order:
                idx.write("\t".join([contig, str(self.q_contigs[contig]),
                                     "1" if contig in contigs_reoriented else "0"]) + "\n")
[docs]    def set_sorted(self, is_sorted):
        """
        Change sorted status

        :param is_sorted: new sorted status
        :type is_sorted: bool
        """
        self.sorted = is_sorted
        sorted_touch = os.path.join(os.path.dirname(self.paf), ".sorted")
        if is_sorted:
            Path(sorted_touch).touch()
        else:
            if os.path.exists(sorted_touch):
                os.remove(sorted_touch)
[docs]    def compute_gravity_contigs(self):
        """
        Compute gravity for each contig on each chromosome (how many big matches they have).
        Will be used to find which chromosome has the highest value for each contig

        :return:
            * [0] gravity for each contig and each chromosome:
              {contig1: {chr1: value, chr2: value, ...}, contig2: ...}
            * [1] for each block, the lines inside:
              [median_on_query, squared length, median_on_target, x1, x2, y1, y2, length]
              (x: on target, y: on query)
        """
        gravity_contig = {}
        lines_on_block = {}
        # Compute the size of blocks (in terms of how many big matches they have), and save the median of each match
        # on each one (for the next step)
        for line in [j for i in list(self.lines.values()) for j in i]:
            x1 = int(line[0])
            x2 = int(line[1])
            y1 = int(line[2])
            y2 = int(line[3])
            contig = line[5]
            chrm = line[6]
            block = (contig, chrm)
            # X1 and X2 (in good orientation):
            x_1 = min(x1, x2)
            x_2 = max(x2, x1)
            med_q = x_1 + (abs(x_2 - x_1) / 2)
            # Y1 and Y2 (in good orientation):
            y_1 = min(y1, y2)
            y_2 = max(y2, y1)
            med_t = y_1 + (abs(y_2 - y_1) / 2)
            len_m = sqrt(pow(x2 - x1, 2) + pow(y2 - y1, 2))  # Len
            len_m_2 = pow(1 + len_m, 2)  # Pow of len
            if block not in lines_on_block:
                lines_on_block[block] = []
            lines_on_block[block].append((med_q, len_m_2, med_t, x1, x2, y1, y2, len_m))

            if contig not in gravity_contig:
                gravity_contig[contig] = {}
            if chrm not in gravity_contig[contig]:
                gravity_contig[contig][chrm] = 0
            gravity_contig[contig][chrm] += len_m_2

        return gravity_contig, lines_on_block
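A numeric sketch of the weighting used above: because each match contributes pow(1 + length, 2), one long match dominates many short ones of the same total length when choosing the best chromosome for a contig.

# Squared-length weighting (toy numbers): a single 10 kb match carries roughly
# ten times the gravity of ten 1 kb matches, although the total aligned length is equal.
one_long = pow(1 + 10000, 2)        # ~1.0e8
ten_short = 10 * pow(1 + 1000, 2)   # ~1.0e7
print(one_long / ten_short)         # ~10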
[docs]    def sort(self):
        """
        Sort contigs according to the reference target and reorient them if needed
        """
        self.parse_paf(False)
        sorted_file = self.paf + ".sorted"
        if not self.sorted:  # Do the sort
            if not self.paf.endswith(".sorted") and not self.idx_q.endswith(".sorted") and \
                    (not os.path.exists(self.paf + ".sorted") or not os.path.exists(self.idx_q + ".sorted")):
                gravity_contig, lines_on_block = self.compute_gravity_contigs()

                # For each contig, find the best block, and deduce the gravity of the contig:
                gravity_on_contig = {}
                reorient_contigs = []
                for contig, chr_blocks in gravity_contig.items():
                    # Find best block:
                    max_number = 0
                    max_chr = None
                    for chrm, size in chr_blocks.items():
                        if size > max_number:
                            max_number = size
                            max_chr = chrm

                    # Compute gravity of contig:
                    nb_items = 0
                    sum_items = 0
                    lines_on_selected_block = lines_on_block[(contig, max_chr)]
                    for med in lines_on_selected_block:
                        sum_items += med[0] * med[1]
                        nb_items += med[1]
                    gravity_on_contig[contig] = sum_items / nb_items

                    # Check if contig must be re-oriented:
                    if len(lines_on_selected_block) > 0:
                        if not self.is_contig_well_oriented(lines_on_selected_block, contig, max_chr):
                            reorient_contigs.append(contig)

                # Sort contigs:
                self.q_order.sort(key=lambda x: gravity_on_contig[x] if x in gravity_on_contig else self.len_q + 1000)

                self.idx_q += ".sorted"

                with open(self.idx_q, "w") as idx_q_f:
                    idx_q_f.write(self.name_q + "\n")
                    for contig in self.q_order:
                        idx_q_f.write("\t".join([contig, str(self.q_contigs[contig])]) + "\n")

                # Re-orient contigs:
                if len(reorient_contigs) > 0:
                    self.reorient_contigs_in_paf(reorient_contigs)
                else:
                    shutil.copyfile(self.paf, sorted_file)

                # Update index:
                self._update_query_index(reorient_contigs)

            else:
                self.idx_q += ".sorted"
            self.set_sorted(True)
            self.paf = sorted_file

        else:  # Undo the sort
            self.paf = self.paf.replace(".sorted", "")
            self.idx_q = self.idx_q.replace(".sorted", "")
            self.set_sorted(False)

        # Re-parse PAF file:
        self.parsed = False
        self.parse_paf()
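In practice sort() behaves as a toggle, something like the sketch below (hypothetical paths): the first call writes the *.sorted files and re-parses, a second call reverts to the original order.

# sort() toggles between sorted and unsorted views of the same job (hypothetical paths).
paf = Paf("/path/to/job/map.paf", "/path/to/job/query.idx", "/path/to/job/target.idx")
paf.sort()         # writes map.paf.sorted / query.idx.sorted and sets the .sorted flag
print(paf.sorted)  # True
paf.sort()         # called again: goes back to the unsorted files
print(paf.sorted)  # False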
[docs]    def reverse_contig(self, contig_name):
        """
        Reverse contig

        :param contig_name: contig name
        :type contig_name: str
        """
        self.parse_paf(False)
        reorient_contigs = [contig_name]
        self.reorient_contigs_in_paf(reorient_contigs)
        if not self.idx_q.endswith(".sorted"):
            self.idx_q += ".sorted"
        self._update_query_index(reorient_contigs)
        self.set_sorted(True)
        self.parsed = False
        self.parse_paf()
[docs]    def get_query_on_target_association(self, with_coords=True):
        """
        For each query, get the best matching chromosome

        :return: query on target association
        :rtype: dict
        """
        gravity_contig, lines_on_block = self.compute_gravity_contigs()
        query_on_target = {}
        for contig, chr_blocks in gravity_contig.items():
            # Find best block:
            max_number = 0
            max_chr = None
            min_target = -1
            max_target = -1
            min_query = -1
            max_query = -1
            for chrm, size in chr_blocks.items():
                if size > max_number:
                    max_number = size
                    max_chr = chrm
                    min_target = -1
                    max_target = -1
                    min_query = -1
                    max_query = -1
                    if with_coords:
                        for line in lines_on_block[(contig, chrm)]:
                            x1 = min(line[3], line[4]) - self.t_abs_start[chrm]
                            x2 = max(line[3], line[4]) - self.t_abs_start[chrm]
                            if x1 < min_target or min_target == -1:
                                min_target = x1
                            if x2 > max_target:
                                max_target = x2
                            y1 = min(line[5], line[6]) - self.q_abs_start[contig]
                            y2 = max(line[5], line[6]) - self.q_abs_start[contig]
                            if y1 < min_query or min_query == -1:
                                min_query = y1
                            if y2 > max_query:
                                max_query = y2
            if max_chr is not None:
                query_on_target[contig] = (max_chr, min_query, max_query, min_target, max_target)
            else:
                query_on_target[contig] = None
        return query_on_target
[docs]    def get_queries_on_target_association(self):
        """
        For each target, get the list of queries associated to it

        :return: list of queries associated to each target
        :rtype: dict
        """
        gravity_contig = self.compute_gravity_contigs()[0]
        queries_on_target = {}
        for contig, chr_blocks in gravity_contig.items():
            # Find best block:
            max_number = 0
            max_chr = None
            for chrm, size in chr_blocks.items():
                if size > max_number:
                    max_number = size
                    max_chr = chrm
            if max_chr is not None:
                if max_chr not in queries_on_target:
                    queries_on_target[max_chr] = []
                queries_on_target[max_chr].append(contig)
        return queries_on_target
[docs]    def build_query_on_target_association_file(self):
        """
        For each query, get the best matching chromosome and save it to a CSV file.
        Use the order of queries

        :return: content of the file
        """
        query_on_target = self.get_query_on_target_association(with_coords=True)
        content = "Query\tTarget\tStrand\tQ-len\tQ-start\tQ-stop\tT-len\tT-start\tT-stop\n"
        for contig in self.q_order:
            strand = "+"
            if contig in self.q_reversed:
                strand = "-" if self.q_reversed[contig] else "+"
            if contig in query_on_target:
                min_target = str(query_on_target[contig][3])
                if min_target == "-1":
                    min_target = "na"
                max_target = str(query_on_target[contig][4])
                if max_target == "-1":
                    max_target = "na"
                min_query = str(query_on_target[contig][1])
                if min_query == "-1":
                    min_query = "na"
                max_query = str(query_on_target[contig][2])
                if max_query == "-1":
                    max_query = "na"
                chrm = query_on_target[contig][0]
                content += "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (contig, chrm or "None", strand,
                                                                     str(self.q_contigs[contig]), min_query, max_query,
                                                                     str(self.t_contigs[chrm]), min_target, max_target)
            else:
                content += "%s\t%s\t%s\t%s\tna\tna\tna\tna\tna\n" % (contig, "None", strand,
                                                                     str(self.q_contigs[contig]))
        return content
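A short usage sketch (hypothetical paths): the returned string is the tab-separated table whose header is shown above.

# Write the query -> target association table (hypothetical paths).
paf = Paf("/path/to/job/map.paf", "/path/to/job/query.idx", "/path/to/job/target.idx")
content = paf.build_query_on_target_association_file()
with open("/path/to/job/query_to_target.tsv", "w") as out:
    out.write(content)
print(content.splitlines()[0])  # Query  Target  Strand  Q-len  Q-start  Q-stop  T-len  T-start  T-stop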
[docs]    def build_list_no_assoc(self, to):
        """
        Build the list of queries that do not match any target, or the opposite

        :param to: query or target
        :return: content of the file
        """
        index = self.idx_q if to == "query" else self.idx_t
        name, contigs_list, contigs, reversed, abs_start, c_len = Index.load(index)
        contigs_list = set(contigs_list)
        with open(self.paf, "r") as paf:
            for line in paf:
                c_name = line.strip("\n").split("\t")[0 if to == "query" else 5]
                if c_name in contigs_list:
                    contigs_list.remove(c_name)
        return "\n".join(contigs_list) + "\n"
    def _add_percents(self, percents, item):
        """
        Update percents with interval

        :param percents: initial percents
        :type percents: dict
        :param item: interval from IntervalTree
        :type item: Interval
        :return: new percents
        :rtype: dict
        """
        i_count = item.length()
        percents[str(item.data)] += i_count
        percents["-1"] -= i_count
        return percents

    def _remove_overlaps(self, position_idy, percents):
        """
        Remove overlaps between matches on the diagonal

        :param position_idy: matches intervals with associated identity category
        :type position_idy: IntervalTree
        :param percents: percent of matches for each identity category
        :type percents: dict
        :return: new percents (updated after overlap removal)
        :rtype: dict
        """
        while len(position_idy) > 0:
            item = position_idy.pop()
            start = item.begin
            end = item.end
            cat = item.data
            overlaps = position_idy.overlap(start, end)
            if len(overlaps) > 0:
                has_overlap = False
                for overlap in overlaps:
                    if has_overlap:
                        break
                    o_start = overlap.begin
                    o_end = overlap.end
                    o_cat = overlap.data
                    if not position_idy.containsi(o_start, o_end, o_cat):
                        continue
                    if start < o_start:
                        if end <= o_end:
                            # cccccccccccccc*******
                            # *****ooooooooo[ooooooo]
                            if o_cat < cat:
                                if end < o_end:
                                    # No overlap with the current item, we keep has_overlap as False
                                    position_idy.discard(overlap)
                                    position_idy[end:o_end] = o_cat
                                else:
                                    position_idy.discard(overlap)  # No kept overlap
                            elif o_cat == cat:
                                if end < o_end:
                                    has_overlap = True
                                    position_idy.discard(overlap)
                                    position_idy[start:o_end] = cat
                                else:
                                    position_idy.discard(overlap)  # No kept overlap
                            else:
                                has_overlap = True
                                position_idy.discard(overlap)
                                position_idy[start:o_start] = cat
                                position_idy[o_start:o_end] = o_cat
                        else:  # end > o_end
                            # ccccccccccccccccccc
                            # *****oooooooooo****
                            if o_cat <= cat:
                                position_idy.discard(overlap)  # No kept overlap
                            else:  # o_cat > cat
                                has_overlap = True
                                position_idy.discard(overlap)
                                position_idy[start:o_start] = cat
                                position_idy[o_start:o_end] = o_cat
                                position_idy[o_end:end] = cat
                    elif start == o_start:
                        if end < o_end:
                            # cccccccccccc*******
                            # ooooooooooooooooooo
                            if o_cat < cat:
                                # No overlap with the current item, we keep has_overlap as False
                                position_idy.discard(overlap)
                                position_idy[end:o_end] = o_cat
                            elif o_cat == cat:
                                has_overlap = True
                                position_idy.discard(overlap)
                                position_idy[start:o_end] = cat
                            else:  # o_cat > cat
                                # The overlap just contains the current item
                                has_overlap = True
                        elif end == o_end:
                            # ***cccccccccccccccc***
                            # ***oooooooooooooooo***
                            if o_cat <= cat:
                                position_idy.discard(overlap)  # No kept overlap
                            else:
                                # The overlap just contains the current item
                                has_overlap = True
                        else:  # end > o_end
                            # ccccccccccccccccccccccccccccc
                            # oooooooooooooooooooo*********
                            if o_cat <= cat:
                                # Current item just contains the overlap
                                position_idy.discard(overlap)
                            else:
                                has_overlap = True
                                position_idy.discard(overlap)
                                position_idy[o_start:o_end] = o_cat
                                position_idy[o_end:end] = cat
                    else:  # start > o_start
                        if end <= o_end:
                            # ******ccccccccc*******
                            # ooooooooooooooo[ooooooo]
                            if o_cat < cat:
                                has_overlap = True
                                position_idy.discard(overlap)
                                position_idy[o_start:start] = o_cat
                                position_idy[start:end] = cat
                                if end < o_end:
                                    position_idy[end:o_end] = o_cat
                            else:  # o_cat >= cat
                                # The overlap just contains the item
                                has_overlap = True
                        else:  # end > o_end
                            # ******ccccccccccccccccccccc
                            # ooooooooooooooooo**********
                            if o_cat < cat:
                                has_overlap = True
                                position_idy.discard(overlap)
                                position_idy[o_start:start] = o_cat
                                position_idy[start:end] = cat
                            elif o_cat == cat:
                                has_overlap = True
                                position_idy.discard(overlap)
                                position_idy[o_start:end] = cat
                            else:  # o_cat > cat
                                has_overlap = True
                                position_idy.discard(overlap)
                                position_idy[o_start:o_end] = o_cat
                                position_idy[o_end:end] = cat
                if not has_overlap:
                    percents = self._add_percents(percents, item)
            else:
                percents = self._add_percents(percents, item)

        return percents
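A reduced sketch of the same bookkeeping with the intervaltree API (made-up intervals): the categories are the integer identity classes, every base of the target starts in the "-1" (uncovered) bucket, and on overlapping spans the best category should win.

# Toy illustration of the overlap bookkeeping (made-up intervals over a 200 bp target).
from intervaltree import IntervalTree

tree = IntervalTree()
tree[0:100] = 3     # high-identity match on target positions 0-100
tree[50:150] = 1    # lower-identity match overlapping it
percents = {"-1": 200, "0": 0, "1": 0, "2": 0, "3": 0}  # everything uncovered at start

p = Paf("res.paf", "q.idx", "t.idx", auto_parse=False)  # instance only needed to call the helper
print(p._remove_overlaps(tree, percents))
# expected: {'-1': 50, '0': 0, '1': 50, '2': 0, '3': 100} -- class "3" keeps the shared 50 bp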
[docs]    def build_summary_stats(self, status_file):
        """
        Get summary of identity

        :return: table with percents by category
        """
        summary_file = self.paf + ".summary"
        self.parse_paf(False, False)
        if self.parsed:
            percents = {"-1": self.len_t}
            position_idy = IntervalTree()

            cats = sorted(self.lines.keys())

            for cat in cats:
                percents[cat] = 0
                for line in self.lines[cat]:
                    start = min(line[0], line[1])
                    end = max(line[0], line[1]) + 1
                    position_idy[start:end] = int(cat)

            percents = self._remove_overlaps(position_idy, percents)

            for cat in percents:
                percents[cat] = percents[cat] / self.len_t * 100

            with open(summary_file, "w") as summary_file:
                summary_file.write(json.dumps(percents))

            os.remove(status_file)
            return percents
        shutil.move(status_file, status_file + ".fail")
        return None
[docs]    def get_summary_stats(self):
        """
        Load summary statistics from file

        :return: summary object, or None if the summary is not already built
        :rtype: dict
        """
        summary_file = self.paf + ".summary"
        if os.path.exists(summary_file):
            with open(summary_file, "r") as summary_file:
                txt = summary_file.read()
                return json.loads(txt)
        return None
[docs]    def build_query_chr_as_reference(self):
        """
        Assemble query contigs like reference chromosomes

        :return: path of the fasta file
        """
        try:
            if not self.sorted:
                raise Exception("Contigs must be sorted to do that!")
            with open(os.path.join(self.data_dir, ".query")) as query_file:
                query_fasta = query_file.read().strip("\n")
            if not os.path.isfile(query_fasta):
                raise Exception("Query fasta does not exist")
            o_fasta = os.path.join(os.path.dirname(query_fasta), "as_reference_" + os.path.basename(query_fasta))
            if o_fasta.endswith(".gz"):
                o_fasta = o_fasta[:-3]
            if not os.path.exists(o_fasta):
                uncompressed = False
                if query_fasta.endswith(".gz"):
                    uncompressed = True
                    query_fasta = Functions.uncompress(query_fasta)
                query_f = SeqIO.index(query_fasta, "fasta")
                contigs_assoc = self.get_queries_on_target_association()
                mapped_queries = set()
                with open(o_fasta, "w") as out:
                    for target in self.t_order:
                        if target in contigs_assoc:
                            queries = sorted(contigs_assoc[target], key=lambda x: self.q_order.index(x))
                            seq = SeqRecord(Seq(""))
                            for query in queries:
                                mapped_queries.add(query)
                                new_seq = query_f[query]
                                if self.q_reversed[query]:
                                    new_seq = new_seq.reverse_complement()
                                seq += new_seq
                                seq += 100 * "N"
                            seq = seq[:-100]
                            seq.id = seq.name = seq.description = target
                            SeqIO.write(seq, out, "fasta")
                    for contig in self.q_order:
                        if contig not in mapped_queries:
                            seq = query_f[contig]
                            seq.id += "_unaligned"
                            SeqIO.write(seq, out, "fasta")
                query_f.close()
                if uncompressed:
                    os.remove(query_fasta)
            status = "success"
        except Exception:
            o_fasta = "_._"
            status = "fail"
        if MODE == "webserver":
            parts = os.path.basename(o_fasta).rsplit(".", 1)
            Functions.send_fasta_ready(mailer=self.mailer, job_name=self.id_job, sample_name=parts[0], ext=parts[1],
                                       compressed=False, path="download", status=status)
        return o_fasta
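A reduced sketch of the assembly step with Biopython (toy sequences): reversed contigs are reverse-complemented, and contigs mapped to the same target are joined with a 100 N spacer, as in the method above.

# Toy sketch of the pseudo-chromosome assembly step (sequences are made up).
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

contig_a = SeqRecord(Seq("ACGTACGT"), id="ctg_a")
contig_b = SeqRecord(Seq("TTTTGGGG"), id="ctg_b")
reversed_contigs = {"ctg_a": False, "ctg_b": True}   # made-up orientation flags

seq = SeqRecord(Seq(""))
for record in (contig_a, contig_b):
    if reversed_contigs[record.id]:
        record = record.reverse_complement()
    seq += record
    seq += 100 * "N"        # spacer between joined contigs
seq = seq[:-100]            # drop the trailing spacer
seq.id = seq.name = seq.description = "chr1"
print(len(seq))             # 8 + 100 + 8 = 116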