Source code for dgenies.bin.sort_paf

#!/usr/bin/env python3

import os
from math import sqrt

__NAME__ = "Sort_paf"
__VERSION__ = 0.1


[docs]class Sorter: """ Sort PAF file by match size """ def __init__(self, input_f, output_f): """ :param input_f: input fasta file path :type input_f: str :param output_f: output fasta file path :type output_f: str """ self.input_f = input_f self.output_f = output_f
[docs] def sort(self): """ Launch sort staff """ paf_lines = self._get_sorted_paf_lines() with open(self.output_f, "w") as out: out.write("\n".join(["\t".join(x[:-1]) for x in paf_lines]))
def _sort_lines(self, lines): """ Sort lines staff :param lines: lines of PAF file to be sorted :type lines: _io.TextIO :return: sorted lines :rtype: list """ paf_lines = [] nb_lines = 0 min_len = 0 for line in lines: parts = line.strip("\n").split("\t") len_line = sqrt(pow(int(parts[3]) - int(parts[2]), 2) + pow(int(parts[8]) - int(parts[7]), 2)) * (int(parts[9]) / int(parts[10])) parts.append(len_line) nb_lines += 1 if nb_lines <= 1000000: paf_lines.append(parts) if nb_lines == 1000000: print("Sorting lines...") paf_lines.sort(key=lambda x: -x[-1]) with open(self.input_f, 'r') as paf: num_lines = sum(1 for line in paf) pct_to_keep = 2000000 / num_lines limit = int(nb_lines * pct_to_keep) paf_lines = paf_lines[:limit] min_len = paf_lines[-1][-1] print("Sorting done!") elif len_line >= min_len: print("Processing line %d..." % nb_lines) paf_lines.append(parts) paf_lines.sort(key=lambda x: -x[-1]) return paf_lines def _get_sorted_paf_lines(self): """ Get sorted PAF :return: sorted PAF lines """ with open(self.input_f, "r") as lines: paf_lines = self._sort_lines(lines) return paf_lines
if __name__ == '__main__': import argparse parser = argparse.ArgumentParser(description="Sort PAF file with size of matches (DESC)") parser.add_argument('-i', '--input', type=str, required=True, help="Input PAF file") parser.add_argument('-o', '--output', type=str, required=True, help="Output PAF file") args = parser.parse_args() if args["--version"]: print(__NAME__, __VERSION__) else: if not os.path.exists(args.input): raise Exception("Input PAF file %s does not exists" % args.input) sorter = Sorter(args.input, args.output) sorter.sort()