Source code for rna_tools.tools.clanstix.rna_clanstix

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""rna_clanstix - a tool for visualizing RNA 3D structures based on pairwise structural similarity with Clans.

We hacked Clans thus instead of BLAST-based distances between sequences, you can analyze distances between structures described as p-values of rmsd (based on the method from the Dokholyan lab.)

Quickref::

      rna_clanstix.py --groups-auto 10 --color-by-homolog --shape-by-source  thf_ref_mapping_pk_refX.txt input2.clans

Running Clans:
To run CLANS you need to have Java 1.4 or better installed (java can be downloaded HERE). For full functionality you will also need the NCBI BLAST,PSI-BLAST and formatdb executables (NCBI). For command line parameters and basic help please refer to the README file.
(source: http://www.eb.tuebingen.mpg.de/research/departments/protein-evolution/software/clans.html)

.. image:: ../../rna_tools/tools/clanstix/doc/yndSrLTb7l.gif

The RMSDs between structures are converted into p-values based on the method from the Dokholyan lab or some hacky way developed by mmagnus .

Color groups
---------------------------------------
You can color your groups:

.. image:: ../../rna_tools/tools/clanstix/doc/rna_clanstix.png

To get colors, run a cmd like this::

   rna_clastix.py rnapz17_matrix_farfar_HelSeedCst.txt --groups 20:seq1+20+20+20+20+20+20:seq10

where with the ``+`` sign you separate groups. Each group has to have a number of structures. Optionally it can have a name, e.g., ``20:seq1``, use ``:`` as a separator. If a provided name is ``native`` then this group will be shown as starts.

Get inspiration for more colors (http://www.rapidtables.com/web/color/RGB_Color.htm)

How to use ClanstixRNA?
----------------------------------------

1. Get a matrix of distances, save it as e.g. matrix.txt (see Comment below)
2. run ClanstixRNA on this matrix to get an input file to Clans (e.g. clans_rna.txt)::

     rna_clanstix.py test_data/matrix.txt # clans.input will be created by default

3. open CLANS and click File -> Load run and load clans_run.txt
4. You're done! :-)

Comment: To get this matrix you can use for example another tool from the rna-pdb-tools packages::

     rna_calc_rmsd_all_vs_all.py -i rp18 -o rp18_rmsd.csv
     rna_clastix.py --groups 1:native+5:3dRNA+
           5:Chen+3:Dokh+5:Feng+5:LeeASModel+
           5:Lee+5:RNAComposer+10:RW3D+5:Rhiju+
           1:YagoubAli+3:SimRNA  rp18_rmsd.csv clans.in

     rna_clastix.py --groups 100+100+100+100+100+100+100+100+100+100+1:native  rp18_rmsd.csv

where ``rp18`` is a folder with structure and ``rp18_rmsd.csv`` is a matrix of all-vs-all rmsds.

.. image:: ../../rna_tools/tools/clanstix/doc/rp18_clanstix.png

Hajdin, C. E., Ding, F., Dokholyan, N. V, & Weeks, K. M. (2010). On the significance of an RNA tertiary structure prediction. RNA (New York, N.Y.), 16(7), 1340–9. doi:10.1261/rna.1837410

An output of this tool can be viewed using CLANS.

Frickey, T., & Lupas, A. (2004). CLANS: a Java application for visualizing protein families based on pairwise similarity. Bioinformatics (Oxford, England), 20(18), 3702–4. doi:10.1093/bioinformatics/bth444
"""
from __future__ import print_function
import argparse
import rna_tools.tools.rna_prediction_significance.rna_prediction_significance as pv
import numpy as np
import math
import logging
import time

logging.basicConfig(level=logging.INFO,
                format='%(message)s',
                datefmt='%m-%d %H:%M',
                filename='rna_clanstix.log',
                filemode='w')

console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter('%(message)s')
console.setFormatter(formatter)
logging.getLogger('').addHandler(console)

log = logging.getLogger()
log.setLevel(logging.INFO)



[docs]
class RNAStructClans:
    """Clans run.

    Usage::

        >>> f = open('matrix.txt')
        >>> ids = f.readline().replace('#','').split()
        >>> c = RNAStructClans(n=len(ids)) # 200?
        >>> c.add_ids(ids)
        >>> c.dist_from_matrix(f)
        >>> print(c.txt)
    """

    def __init__(self, n=10, dotsize=10):
        self.n = n
        self.comment = ''
        #cluster2d=false
        self.txt = """sequences=%i
<param>
maxmove=0.1
pval=%s
usescval=false
complexatt=true
cooling=1.0
currcool=1.0
attfactor=10.0
attvalpow=1
repfactor=10.0
repvalpow=1
dampening=1.0
minattract=1.0
cluster2d=true
blastpath=''
formatdbpath=''
showinfo=false
zoom=0.9
dotsize=%s
ovalsize=10
groupsize=4
usefoldchange=false
avgfoldchange=false
colorcutoffs=0.0;0.1;0.2;0.3;0.4;0.5;0.6;0.7;0.8;0.9;
colorarr=(230;230;230):(207;207;207):(184;184;184):(161;161;161):(138;138;138):(115;115;115):(92;92;92):(69;69;69):(46;46;46):(23;23;23):
</param>""" % (n, args.pvalue, str(dotsize))


[docs]
    def add_ids(self, ids):
        t = '\n<seq>\n'
        for i in ids:
            t += '>' + i + '\n'
            t += 'X' + '\n'
        t += '</seq>\n'
        if len(ids) != self.n:
            # print 'n != ids'
            raise Exception('n != ids')
        self.txt += t



[docs]
    def dist_from_matrix(self, rmsds, matrix=0, use_pv=False, use_input_values=False, dont_calc=False, debug=False):
        if dont_calc:
            print('Everything but the dists are generated. Use it to edit the original clans input file.')
            return # for some hardcore debugging ;-)
        t = '\n<hsp>\n'
        c = 0
        c2 = 0
        if debug: print(rmsds)
        for row in rmsds:
            for rmsd in row:
                if c != c2:
                    if use_input_values:
                        dist = float(str(rmsd).replace('e', 'E'))
                    elif use_pv:
                        dist = pv.get_p_value(rmsd, 1 * 38)[0]  # r.get_rmsd_to(r2), 3)
                    else:
                        # 1e-06 10-1 = 9 10-10 0
                        dist = '1.0E-' + str(int(math.floor(matrix.max()) - int(float(rmsd))))
                    t += str(c) + ' ' + str(c2) + ':' + str(dist) + '\n'
                c2 += 1
            c2 = 0
            c += 1

        t += '</hsp>\n'

        if not use_input_values:
            max = math.ceil(matrix.max())
            min = matrix[matrix>0].min()

            self.comment = '# max: %f min (non-zero): %f\n' % (max, min)
            self.comment += '# 1/4 ' + str((max - min) / 4) + ' ' + str(round((max - min) / 4, 0)) + '\n'
            self.comment += '# 1/2 ' + str((max - min) / 2) + ' ' + str(round((max - min) / 2, 0)) + '\n'
            self.comment += '# 1/3 ' + str(((max - min) / 4 ) * 3 ) + ' ' + str(round(((max - min) / 4) * 3, 0)) + '\n'
            for i in range(1,20):
                self.comment += '# connected points with RMSD lower than %iA 1.0E-%i\n' % (i, math.ceil(matrix.max()) - i)
            # 1E-11 = 0
            # 1E-10 = 1-0
            # 1E-9 = 2-1
        self.txt += t
        print(t)



[docs]
    def dist_from_matrix_mp(self, output_pmatrix_fn, max, min, lines, pmat=False, use_pv=False, use_input_values=False, debug=False):
        if debug:
            print('Everything but the dists are generated. Use it to edit the original clans input file.')
            return # for some hardcore debugging ;-)
        t = '\n<hsp>\n'
        myp = ''
        c = 0
        c2 = 0
        for l in lines:
            for rmsd in l:
                if c != c2:
                    if use_input_values:
                        dist = rmsd.replace('e', 'E')
                    if use_pv:
                        dist = pv.get_p_value(rmsd, 1 * 38)[0]  # r.get_rmsd_to(r2), 3)
                    else:
                        # 1e-06 10-1 = 9 10-10 0
                        dist = '1.0E-' + str(int(math.floor(matrix.max()) - int(float(rmsd))))
                    t += str(c) + ' ' + str(c2) + ':' + str(dist) + '\n'
                    myp += ' ' + str(dist)
                else:
                    myp += ' ' + '0.0'
                c2 += 1
            myp += '\n'
            c2 = 0
            c += 1

        t += '</hsp>\n'

        if not use_input_values:
            max = math.ceil(matrix.max())
            min = matrix[matrix>0].min()

            self.comment = '# max: %f min (non-zero): %f\n' % (max, min)
            self.comment += '# 1/4 ' + str((max - min) / 4) + ' ' + str(round((max - min) / 4, 0)) + '\n'
            self.comment += '# 1/2 ' + str((max - min) / 2) + ' ' + str(round((max - min) / 2, 0)) + '\n'
            self.comment += '# 1/3 ' + str(((max - min) / 4 ) * 3 ) + ' ' + str(round(((max - min) / 4) * 3, 0)) + '\n'
            for i in range(1,20):
                self.comment += '# connected points with RMSD lower than %iA 1.0E-%i\n' % (i, math.ceil(matrix.max()) - i)
            # 1E-11 = 0
            # 1E-10 = 1-0
            # 1E-9 = 2-1

        self.txt += t

        if pmat:
            with open(output_pmatrix_fn, 'w') as f:
                f.write(myp)
        return t




[docs]
def check_symmetric(a, rtol=1e-05, atol=1e-08):
    """
    https://stackoverflow.com/questions/42908334/checking-if-a-matrix-is-symmetric-in-numpy
    """
    return np.allclose(a, a.T, rtol=rtol, atol=atol)




[docs]
def get_parser():
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('matrixfn', help="matrix")

    parser.add_argument('--groups-auto', help="define into how many groups make automatically", type=int)

    parser.add_argument('--color-by-homolog', help="color the same homolog in the same way", action='store_true')
    parser.add_argument('--one-target', help="color the same homolog in the same way", action='store_true')

    parser.add_argument('--shape-by-source', help="shape points based on source, SimRNA vs Rosetta (Rosetta models have 'min' in name')",
                        action='store_true')

    parser.add_argument('--debug', action='store_true')

    parser.add_argument('--groups-dot-size', type=int, default=8)

    parser.add_argument('--dont-calc', action='store_true', help="A simple and dirty trick to get "
                        "generates everything but the main distances from the matrix"
                        "useful if you want to run the script to generate different settings, such as"
                        "colors, groups etc. Run the script and then replace parts of the original "
                        "file with the matrix")

    parser.add_argument('--groups', help="groups, at the moment up to 7 groups can be handle"
                        "--groups 1:native+100:zmp+100:zaa+100:zbaa+100:zcp+100:znc"
                        "--groups 1:native+100:nats+100:hom1+100:hom2+100:hom3+100:hom4"
                        "native will light green"
                        "zmp will be forest green"
                        "(easy to be changed in the future)")

    parser.add_argument('--use-pvalue', action='store_true', help="")
    parser.add_argument('--use-input-values', action='store_true', help="")

    parser.add_argument('--pvalue', default="1.0E-15", help="set p-value for clans.input, default: 1.0E-15")


    parser.add_argument('--output', help="input file for clans, e.g., clans.input", default="clans.input")
    parser.add_argument('--output-pmatrix', action='store_true', help="p value matrix will be saved, see --output-matrix-fn to define name")
    parser.add_argument('--output-pmatrix-fn', default="pmatrix.txt", help="filename of output matrix, pmatrix.txt by default")
    parser.add_argument('--multiprocessing', action='store_true', help="run calculations in parallel way, warning: extra libraries required, see the Docs")

    return parser



#main
if __name__ == '__main__':
    parser = get_parser()
    args = parser.parse_args()
    debug = args.debug  # as a short cut later
    f = open(args.matrixfn)
    # OK, check if there is a header = the line with '#'
    headers = f.readline()
    if headers.strip().startswith('#'):
        # if yes, then split remove # and split into lists
        ids = headers.replace('#', '').split()
        for i in ids:
            print(i)
    else:
        # if no, then make a list form [0, # of items in the first line]
        ids = [str(i) for i in range(0, len(headers.split()))]
        f.seek(0)

    # get max
    logging.info(time.strftime("%Y-%m-%d %H:%M:%S"))

    # Collect rmsds into a list
    rmsds = []
    for l in f:
            #for rmsd in map(float,l.split()):  # map(float, s.split())
       		rmsds.append(map(float,l.split()))

    # warning:
    # eh, this matrix is not really used by clanstix main engine
    # it's used to calc min and max value of a matrix
    matrix = np.loadtxt(args.matrixfn)
    if check_symmetric(matrix):
        if args.debug: print('Matrix is symmetrical! ', matrix.shape)
    else:
        raise Exception('Matrix is not symmetrical! Check your matrix', matrix.shape)

    # keep dot quite big by default,
    # but if you use dotsize then keep this one 0
    dotsize =  8
    if args.groups_auto or args.groups:
        dotsize =  0

    c = RNAStructClans(n=len(ids), dotsize=dotsize)  # 200?
    c.add_ids(ids)
    if debug:
        print('dist_from_matrix...')

    if args.multiprocessing:
        import multiprocessing as mp
        #import dill
        #import parmap
        from pathos.multiprocessing import ProcessingPool

        matrix = np.loadtxt(args.matrixfn)
        max = int(math.floor(matrix.max()))
        min = matrix[matrix>0].min()
        clans_list_of_pvalues = ''
        pool = ProcessingPool(mp.cpu_count())
        x=pool.map(c.dist_from_matrix_mp, [args.output_pmatrix_fn], [max], [min], [rmsds], [args.output_pmatrix], )
        pool.close()
        for i in x:
            clans_list_of_pvalues = clans_list_of_pvalues.join(i)
    else:
        c.dist_from_matrix(rmsds, matrix, args.use_pvalue, args.use_input_values, args.dont_calc, args.debug)
        if debug:
            print('process the matrix...')
    #
    # DEFINE GROUPS
    #
    # 1+20+20+20+20+20
    seqgroups = ''
    #colors = ['0;255;102;255', # ligthgreen 1
    #          '0;102;0;255', # forest 2
    # A list of colors used later ....
    colors = [
              '255;102;102;255', # red 3
              '51;51;255;255', # blue 4
              '0;255;255;255', # light blue +1
              '180;38;223;255', # violet 5
              '64;64;64;255', # grey 6
              '255;128;0;255', # orange 7
              '240;230;140;255', #khaki
              '210;105;30;255', # chocolate
              '0;255;255;255',  # cyjan
              '128;0;128;255', # purple 8
              '0;128;128;255', # Teal 9
              '128;0;0;255', # maroon 10

               ]

    # This is pretty much the same list as above, but in here I have more distinguishable colors,
    # as I should be used with less number of groups
    colors_homologs = [
              '255;102;102;255', # red 3
              '51;51;255;255', # blue 4
              '64;64;64;255', # grey 6
              '128;0;128;255', # purple 8
              '128;0;0;255', # maroon 10
              '0;255;255;255',  # cyjan
              '237;41;57;255', # red
              #'210;105;30;255', # chocolate
               ]

    # OK, this is the trick
    # if args.groups_auto is on, then you built args.groups automatically,
    # and then just simply run the next if as it was args.groups in the arguments
    # pretty cool ;-)
    if args.groups_auto:
        # arsg_groups = ''
        from collections import OrderedDict
        # collect groups
        #groups = []
        groups = OrderedDict()
        for i in ids:
            #
            # collect homologs gmp_
            # simrna and farna
            print(i)
            group_name = i[:args.groups_auto]
            if group_name in groups:
                groups[group_name] += 1
            else:
                groups[group_name] = 1
        groups_str = ''
        for g in groups:
            groups_str += str(groups[g]) + ':' + g + '+'
        #
        # this is the trick, you
        args.groups = groups_str[:-1]
        print(groups_str)
        # change this to get 1:hccp+10:zmp+10:xrt

    if args.groups:
        # this is a manual way how to do it
        groups = args.groups.split('+')
        seqgroups = '<seqgroups>\n'
        curr_number = 0
        homologs_colors = {}
        if args.debug: print(args.groups)
        for index, group in enumerate(groups):
            # parse groups
            # type and size will be changed for native
            size = args.groups_dot_size
            dottype = 0
            color = '' # use for diff color selection if tar
            if ':' in group:
                nstruc, name = group.split(':')
                if name == 'native':
                    dottype = 8
                    size = 20
                    color = '0;128;128;255' # olive

                if 'solution' in name:
                    dottype = 8
                    size = 30  # solution is bigger
                    color = '0;128;128;255' # olive

                if args.one_target: # target is target, dont distinguish it
                    if 'tar' in name and 'tar_min' not in name: # SimRNA
                        dottype = 7
                        size = size # 7  # size of SimRNA reference seq points
                        color = '0;255;102;255' # ligthgreen 1

                    if 'tar_min' in name:
                        dottype = 9
                        size = size # 7 # size of Rosetta reference seq points
                        color = '0;102;0;255' # forest 2
                else:
                    if 'tar' in name: # SimRNA
                        size = size #8  # size of SimRNA reference seq points
                        dottype = 0
                        color = '0;255;102;255' # ligthgreen 1

                if args.shape_by_source:
                    # Rosetta models are diamond now
                    if 'min' in name:
                        dottype = 2

                # color by homolog
                if args.color_by_homolog:
                    # 10:gxx_6bd266+10:gxx_min.ou+10:gbaP_d2b57+10:gbaP_min.o+10:gbx_00de79+10:gbx_min.ou+10:gapP_d9d22+10:gapP_min.o+10:gmp_faa97e+10:gmp_min.ou
                    tmp = name.split('_')
                    homolog_name = tmp[0]
                    if debug: 'homolog_name', homolog_name
                    # ignore tar and solution, their colors are defined above ^
                    ## [mm] simrna5x100farna5x100$ git:(master) ✗ rna_clastix.py --groups-auto 8 --color-by-homolog --shape-by-source rp14sub_ref_mapping_refX.txt input2.clans --debug
                    ## 2019-01-09 12:23:21
                    ## 100:tar_min.+100:tar_rp14+1:solution+100:cy2_min.+100:cy2_r14a+100:aj6_min.+100:aj6_r14a
                    ## {'cy2': '210;105;30;255'}
                    ## {'cy2': '210;105;30;255'}
                    ## {'cy2': '210;105;30;255', 'aj6': '0;255;255;255'}
                    ## {'cy2': '210;105;30;255', 'aj6': '0;255;255;255'}
                    ## # max: 18.000000 min (non-zero): 0.810000
                    if homolog_name == 'tar' or homolog_name.startswith('solution'):
                        pass
                    else:
                        if homolog_name in homologs_colors:
                            color = homologs_colors[homolog_name]
                        else:
                            homologs_colors[homolog_name] = colors_homologs.pop()
                            color = homologs_colors[homolog_name]
                        if debug: print(homologs_colors)
            else:
                nstruc = group
                name = 'foo'

            # color is set in the args.color_by_homologs
            # craft seqgroups
            seqgroups += '\n'
            seqgroups += "name=%s\n" % name

            # color hack
            if color:  # this color is fixed for native right now
                seqgroups += "color=%s\n" % color
            else:
                # if beyond index, use different shape
                #try:
                #print(index, len(colors))
                if index >= len(colors):
                    index = index - len(colors) # reset color index
                    dottype = 6  # set dottype when the colors are done 
                seqgroups += "color=%s\n" % colors[index]

            seqgroups += "type=%i\n" % dottype
            # color hack
            seqgroups += "size=%i\n" % size
            seqgroups += "hide=0\n"

            if debug:
                print('name: ' + name + ' ' + colors[index])
            # get numbers - convert nstruc into numbers in Clans format 0;1; etc.
            # remember: it starts from 0
            # --groups 1:hccp+10:zmp+10:xrt
            # hccp
            # [0]
            # zmp
            # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
            # xrt
            # [11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
            numbers = range(curr_number, curr_number + int(nstruc))  # 0, 1
            curr_number = curr_number + int(nstruc)
            seqgroups += "numbers=%s;\n" % ';'.join([str(number) for number in numbers])
        seqgroups += '</seqgroups>\n'

    with open(args.output, 'w') as f:
        f.write(c.txt)
        if args.multiprocessing:
            f.write(clans_list_of_pvalues)
        f.write(seqgroups)
        f.write(c.comment)
    # if debug: print(c.txt)
    print(c.comment)

    logging.info(time.strftime("%Y-%m-%d %H:%M:%S"))
    if debug:
        print(seqgroups)