Source code for rna_tools.tools.clanstix.rna_clanstix

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""rna_clanstix - a tool for visualizing RNA 3D structures based on pairwise structural similarity with Clans.

We hacked Clans thus instead of BLAST-based distances between sequences, you can analyze distances between structures described as p-values of rmsd (based on the method from the Dokholyan lab.)

Quickref::

      rna_clanstix.py --groups-auto 10 --color-by-homolog --shape-by-source  thf_ref_mapping_pk_refX.txt input2.clans

Running Clans:
To run CLANS you need to have Java 1.4 or better installed (java can be downloaded HERE). For full functionality you will also need the NCBI BLAST,PSI-BLAST and formatdb executables (NCBI). For command line parameters and basic help please refer to the README file.
(source: http://www.eb.tuebingen.mpg.de/research/departments/protein-evolution/software/clans.html)

.. image:: ../../rna_tools/tools/clanstix/doc/yndSrLTb7l.gif

The RMSDs between structures are converted into p-values based on the method from the Dokholyan lab or some hacky way developed by mmagnus .

Color groups
---------------------------------------
You can color your groups:

.. image:: ../../rna_tools/tools/clanstix/doc/rna_clanstix.png

To get colors, run a cmd like this::

   rna_clastix.py rnapz17_matrix_farfar_HelSeedCst.txt --groups 20:seq1+20+20+20+20+20+20:seq10

where with the ``+`` sign you separate groups. Each group has to have a number of structures. Optionally it can have a name, e.g., ``20:seq1``, use ``:`` as a separator. If a provided name is ``native`` then this group will be shown as starts.

Get inspiration for more colors (http://www.rapidtables.com/web/color/RGB_Color.htm)

How to use ClanstixRNA?
----------------------------------------

1. Get a matrix of distances, save it as e.g. matrix.txt (see Comment below)
2. run ClanstixRNA on this matrix to get an input file to Clans (e.g. clans_rna.txt)::

     rna_clanstix.py test_data/matrix.txt # clans.input will be created by default

3. open CLANS and click File -> Load run and load clans_run.txt
4. You're done! :-)

Comment: To get this matrix you can use for example another tool from the rna-pdb-tools packages::

     rna_calc_rmsd_all_vs_all.py -i rp18 -o rp18_rmsd.csv
     rna_clastix.py --groups 1:native+5:3dRNA+
           5:Chen+3:Dokh+5:Feng+5:LeeASModel+
           5:Lee+5:RNAComposer+10:RW3D+5:Rhiju+
           1:YagoubAli+3:SimRNA  rp18_rmsd.csv clans.in

     rna_clastix.py --groups 100+100+100+100+100+100+100+100+100+100+1:native  rp18_rmsd.csv

where ``rp18`` is a folder with structure and ``rp18_rmsd.csv`` is a matrix of all-vs-all rmsds.

.. image:: ../../rna_tools/tools/clanstix/doc/rp18_clanstix.png

Hajdin, C. E., Ding, F., Dokholyan, N. V, & Weeks, K. M. (2010). On the significance of an RNA tertiary structure prediction. RNA (New York, N.Y.), 16(7), 1340–9. doi:10.1261/rna.1837410

An output of this tool can be viewed using CLANS.

Frickey, T., & Lupas, A. (2004). CLANS: a Java application for visualizing protein families based on pairwise similarity. Bioinformatics (Oxford, England), 20(18), 3702–4. doi:10.1093/bioinformatics/bth444
"""

from __future__ import print_function
import argparse
import rna_tools.tools.rna_prediction_significance.rna_prediction_significance as pv
import numpy as np
import math
import logging
import time

logging.basicConfig(level=logging.INFO,
                format='%(message)s',
                datefmt='%m-%d %H:%M',
                filename='rna_clanstix.log',
                filemode='w')

console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter('%(message)s')
console.setFormatter(formatter)
logging.getLogger('').addHandler(console)

log = logging.getLogger()
log.setLevel(logging.INFO)


[docs]class RNAStructClans: """Clans run. Usage:: >>> f = open('matrix.txt') >>> ids = f.readline().replace('#','').split() >>> c = RNAStructClans(n=len(ids)) # 200? >>> c.add_ids(ids) >>> c.dist_from_matrix(f) >>> print(c.txt) """ def __init__(self, n=10, dotsize=10): self.n = n self.comment = '' #cluster2d=false self.txt = """sequences=%i <param> maxmove=0.1 pval=%s usescval=false complexatt=true cooling=1.0 currcool=1.0 attfactor=10.0 attvalpow=1 repfactor=10.0 repvalpow=1 dampening=1.0 minattract=1.0 cluster2d=true blastpath='' formatdbpath='' showinfo=false zoom=0.9 dotsize=%s ovalsize=10 groupsize=4 usefoldchange=false avgfoldchange=false colorcutoffs=0.0;0.1;0.2;0.3;0.4;0.5;0.6;0.7;0.8;0.9; colorarr=(230;230;230):(207;207;207):(184;184;184):(161;161;161):(138;138;138):(115;115;115):(92;92;92):(69;69;69):(46;46;46):(23;23;23): </param>""" % (n, args.pvalue, str(dotsize))
[docs] def add_ids(self, ids): t = '\n<seq>\n' for i in ids: t += '>' + i + '\n' t += 'X' + '\n' t += '</seq>\n' if len(ids) != self.n: # print 'n != ids' raise Exception('n != ids') self.txt += t
[docs] def dist_from_matrix(self, rmsds, matrix=0, use_pv=False, dont_calc=False, debug=False): if dont_calc: print('Everything but the dists are generated. Use it to edit the original clans input file.') return # for some hardcore debugging ;-) t = '\n<hsp>\n' c = 0 c2 = 0 if debug: print(rmsds) for row in rmsds: for rmsd in row: if c != c2: if use_pv: dist = pv.get_p_value(rmsd, 1 * 38)[0] # r.get_rmsd_to(r2), 3) else: # 1e-06 10-1 = 9 10-10 0 dist = '1.0E-' + str(int(math.floor(matrix.max()) - int(float(rmsd)))) t += str(c) + ' ' + str(c2) + ':' + str(dist) + '\n' c2 += 1 c2 = 0 c += 1 t += '</hsp>\n' max = math.ceil(matrix.max()) min = matrix[matrix>0].min() self.comment = '# max: %f min (non-zero): %f\n' % (max, min) self.comment += '# 1/4 ' + str((max - min) / 4) + ' ' + str(round((max - min) / 4, 0)) + '\n' self.comment += '# 1/2 ' + str((max - min) / 2) + ' ' + str(round((max - min) / 2, 0)) + '\n' self.comment += '# 1/3 ' + str(((max - min) / 4 ) * 3 ) + ' ' + str(round(((max - min) / 4) * 3, 0)) + '\n' for i in range(1,20): self.comment += '# connected points with RMSD lower than %iA 1.0E-%i\n' % (i, math.ceil(matrix.max()) - i) # 1E-11 = 0 # 1E-10 = 1-0 # 1E-9 = 2-1 self.txt += t
[docs] def dist_from_matrix_mp(self, output_pmatrix_fn, max, min, lines, pmat=False, use_pv=False, debug=False): if debug: print('Everything but the dists are generated. Use it to edit the original clans input file.') return # for some hardcore debugging ;-) t = '\n<hsp>\n' myp = '' c = 0 c2 = 0 for l in lines: for rmsd in l: if c != c2: if use_pv: dist = pv.get_p_value(rmsd, 1 * 38)[0] # r.get_rmsd_to(r2), 3) else: # 1e-06 10-1 = 9 10-10 0 dist = '1.0E-' + str(int(math.floor(matrix.max()) - int(float(rmsd)))) t += str(c) + ' ' + str(c2) + ':' + str(dist) + '\n' myp += ' ' + str(dist) else: myp += ' ' + '0.0' c2 += 1 myp += '\n' c2 = 0 c += 1 t += '</hsp>\n' max = math.ceil(matrix.max()) min = matrix[matrix>0].min() self.comment = '# max: %f min (non-zero): %f\n' % (max, min) self.comment += '# 1/4 ' + str((max - min) / 4) + ' ' + str(round((max - min) / 4, 0)) + '\n' self.comment += '# 1/2 ' + str((max - min) / 2) + ' ' + str(round((max - min) / 2, 0)) + '\n' self.comment += '# 1/3 ' + str(((max - min) / 4 ) * 3 ) + ' ' + str(round(((max - min) / 4) * 3, 0)) + '\n' for i in range(1,20): self.comment += '# connected points with RMSD lower than %iA 1.0E-%i\n' % (i, math.ceil(matrix.max()) - i) # 1E-11 = 0 # 1E-10 = 1-0 # 1E-9 = 2-1 self.txt += t if pmat: with open(output_pmatrix_fn, 'w') as f: f.write(myp) return t
[docs]def check_symmetric(a, rtol=1e-05, atol=1e-08): """ https://stackoverflow.com/questions/42908334/checking-if-a-matrix-is-symmetric-in-numpy """ return np.allclose(a, a.T, rtol=rtol, atol=atol)
[docs]def get_parser(): parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument('matrixfn', help="matrix") parser.add_argument('--groups-auto', help="define into how many groups make automatically", type=int) parser.add_argument('--color-by-homolog', help="color the same homolog in the same way", action='store_true') parser.add_argument('--one-target', help="color the same homolog in the same way", action='store_true') parser.add_argument('--shape-by-source', help="shape points based on source, SimRNA vs Rosetta (Rosetta models have 'min' in name')", action='store_true') parser.add_argument('--debug', action='store_true') parser.add_argument('--groups-dot-size', type=int, default=8) parser.add_argument('--dont-calc', action='store_true', help="A simple and dirty trick to get " "generates everything but the main distances from the matrix" "useful if you want to run the script to generate different settings, such as" "colors, groups etc. Run the script and then replace parts of the original " "file with the matrix") parser.add_argument('--groups', help="groups, at the moment up to 7 groups can be handle" "--groups 1:native+100:zmp+100:zaa+100:zbaa+100:zcp+100:znc" "--groups 1:native+100:nats+100:hom1+100:hom2+100:hom3+100:hom4" "native will light green" "zmp will be forest green" "(easy to be changed in the future)") parser.add_argument('--use-pvalue', action='store_true', help="") parser.add_argument('--pvalue', default="1.0E-15", help="set p-value for clans.input, default: 1.0E-15") parser.add_argument('--output', help="input file for clans, e.g., clans.input", default="clans.input") parser.add_argument('--output-pmatrix', action='store_true', help="p value matrix will be saved, see --output-matrix-fn to define name") parser.add_argument('--output-pmatrix-fn', default="pmatrix.txt", help="filename of output matrix, pmatrix.txt by default") parser.add_argument('--multiprocessing', action='store_true', help="run calculations in parallel way, warning: extra libraries required, see the Docs") return parser
#main if __name__ == '__main__': parser = get_parser() args = parser.parse_args() debug = args.debug # as a short cut later f = open(args.matrixfn) # OK, check if there is a header = the line with '#' headers = f.readline() if headers.strip().startswith('#'): # if yes, then split remove # and split into lists ids = headers.replace('#', '').split() else: # if no, then make a list form [0, # of items in the first line] ids = [str(i) for i in range(0, len(headers.split()))] f.seek(0) # get max logging.info(time.strftime("%Y-%m-%d %H:%M:%S")) # Collect rmsds into a list rmsds = [] for l in f: #for rmsd in map(float,l.split()): # map(float, s.split()) rmsds.append(map(float,l.split())) # warning: # eh, this matrix is not really used by clanstix main engine # it's used to calc min and max value of a matrix matrix = np.loadtxt(args.matrixfn) if check_symmetric(matrix): if args.debug: print('Matrix is symmetrical! ', matrix.shape) else: raise Exception('Matrix is not symmetrical! Check your matrix', matrix.shape) # keep dot quite big by default, # but if you use dotsize then keep this one 0 dotsize = 8 if args.groups_auto or args.groups: dotsize = 0 c = RNAStructClans(n=len(ids), dotsize=dotsize) # 200? c.add_ids(ids) if debug: print('dist_from_matrix...') if args.multiprocessing: import multiprocessing as mp #import dill #import parmap from pathos.multiprocessing import ProcessingPool matrix = np.loadtxt(args.matrixfn) max = int(math.floor(matrix.max())) min = matrix[matrix>0].min() clans_list_of_pvalues = '' pool = ProcessingPool(mp.cpu_count()) x=pool.map(c.dist_from_matrix_mp, [args.output_pmatrix_fn], [max], [min], [rmsds], [args.output_pmatrix], ) pool.close() for i in x: clans_list_of_pvalues = clans_list_of_pvalues.join(i) else: c.dist_from_matrix(rmsds, matrix, args.use_pvalue, args.dont_calc, args.debug) if debug: print('process the matrix...') # # DEFINE GROUPS # # 1+20+20+20+20+20 seqgroups = '' #colors = ['0;255;102;255', # ligthgreen 1 # '0;102;0;255', # forest 2 # A list of colors used later .... colors = [ '255;102;102;255', # red 3 '51;51;255;255', # blue 4 '0;255;255;255', # light blue +1 '180;38;223;255', # violet 5 '64;64;64;255', # grey 6 '255;128;0;255', # orange 7 '240;230;140;255', #khaki '210;105;30;255', # chocolate '0;255;255;255', # cyjan '128;0;128;255', # purple 8 '0;128;128;255', # Teal 9 '128;0;0;255', # maroon 10 ] # This is pretty much the same list as above, but in here I have more distinguishable colors, # as I should be used with less number of groups colors_homologs = [ '255;102;102;255', # red 3 '51;51;255;255', # blue 4 '64;64;64;255', # grey 6 '128;0;128;255', # purple 8 '128;0;0;255', # maroon 10 '0;255;255;255', # cyjan '237;41;57;255', # red #'210;105;30;255', # chocolate ] # OK, this is the trick # if args.groups_auto is on, then you built args.groups automatically, # and then just simply run the next if as it was args.groups in the arguments # pretty cool ;-) if args.groups_auto: # arsg_groups = '' from collections import OrderedDict # collect groups #groups = [] groups = OrderedDict() for i in ids: # # collect homologs gmp_ # simrna and farna print(i) group_name = i[:args.groups_auto] if group_name in groups: groups[group_name] += 1 else: groups[group_name] = 1 groups_str = '' for g in groups: groups_str += str(groups[g]) + ':' + g + '+' # # this is the trick, you args.groups = groups_str[:-1] print(groups_str) # change this to get 1:hccp+10:zmp+10:xrt if args.groups: # this is a manual way how to do it groups = args.groups.split('+') seqgroups = '<seqgroups>\n' curr_number = 0 homologs_colors = {} if args.debug: print(args.groups) for index, group in enumerate(groups): # parse groups # type and size will be changed for native size = args.groups_dot_size dottype = 0 color = '' # use for diff color selection if tar if ':' in group: nstruc, name = group.split(':') if name == 'native': dottype = 8 size = 20 color = '0;128;128;255' # olive if 'solution' in name: dottype = 8 size = 30 # solution is bigger color = '0;128;128;255' # olive if args.one_target: # target is target, dont distinguish it if 'tar' in name and 'tar_min' not in name: # SimRNA dottype = 7 size = size # 7 # size of SimRNA reference seq points color = '0;255;102;255' # ligthgreen 1 if 'tar_min' in name: dottype = 9 size = size # 7 # size of Rosetta reference seq points color = '0;102;0;255' # forest 2 else: if 'tar' in name: # SimRNA size = size #8 # size of SimRNA reference seq points dottype = 0 color = '0;255;102;255' # ligthgreen 1 if args.shape_by_source: # Rosetta models are diamond now if 'min' in name: dottype = 2 # color by homolog if args.color_by_homolog: # 10:gxx_6bd266+10:gxx_min.ou+10:gbaP_d2b57+10:gbaP_min.o+10:gbx_00de79+10:gbx_min.ou+10:gapP_d9d22+10:gapP_min.o+10:gmp_faa97e+10:gmp_min.ou tmp = name.split('_') homolog_name = tmp[0] if debug: 'homolog_name', homolog_name # ignore tar and solution, their colors are defined above ^ ## [mm] simrna5x100farna5x100$ git:(master) ✗ rna_clastix.py --groups-auto 8 --color-by-homolog --shape-by-source rp14sub_ref_mapping_refX.txt input2.clans --debug ## 2019-01-09 12:23:21 ## 100:tar_min.+100:tar_rp14+1:solution+100:cy2_min.+100:cy2_r14a+100:aj6_min.+100:aj6_r14a ## {'cy2': '210;105;30;255'} ## {'cy2': '210;105;30;255'} ## {'cy2': '210;105;30;255', 'aj6': '0;255;255;255'} ## {'cy2': '210;105;30;255', 'aj6': '0;255;255;255'} ## # max: 18.000000 min (non-zero): 0.810000 if homolog_name == 'tar' or homolog_name.startswith('solution'): pass else: if homolog_name in homologs_colors: color = homologs_colors[homolog_name] else: homologs_colors[homolog_name] = colors_homologs.pop() color = homologs_colors[homolog_name] if debug: print(homologs_colors) else: nstruc = group name = 'foo' # color is set in the args.color_by_homologs # craft seqgroups seqgroups += "name=%s\n" % name seqgroups += "type=%i\n" % dottype # color hack if color: # this color is fixed for native right now seqgroups += "color=%s\n" % color else: seqgroups += "color=%s\n" % colors[index] # color hack seqgroups += "size=%i\n" % size seqgroups += "hide=0\n" if debug: print('name: ' + name + ' ' + colors[index]) # get numbers - convert nstruc into numbers in Clans format 0;1; etc. # remember: it starts from 0 # --groups 1:hccp+10:zmp+10:xrt # hccp # [0] # zmp # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] # xrt # [11, 12, 13, 14, 15, 16, 17, 18, 19, 20] numbers = range(curr_number, curr_number + int(nstruc)) # 0, 1 curr_number = curr_number + int(nstruc) seqgroups += "numbers=%s;\n" % ';'.join([str(number) for number in numbers]) seqgroups += '</seqgroups>\n' with open(args.output, 'w') as f: f.write(c.txt) if args.multiprocessing: f.write(clans_list_of_pvalues) f.write(seqgroups) f.write(c.comment) # if debug: print(c.txt) print(c.comment) logging.info(time.strftime("%Y-%m-%d %H:%M:%S")) if debug: print(seqgroups)