Source code for rna_tools.tools.rna_rosetta.rna_rosetta_run

#!/usr/bin/env python
# -*- coding: utf-8 -*-
r"""rna_rosetta_run.py - prepare & run ROSETTA simulations

Based on C. Y. Cheng, F. C. Chou, and R. Das, Modeling complex RNA tertiary folds with Rosetta, 1st ed., vol. 553. Elsevier Inc., 2015.
http: // www.sciencedirect.com / science / article / pii / S0076687914000524

The script makes(1) a folder for you job, with seq.fa, ss.fa, input file is copied as input.fa to the folder(2) make helices(3) prepare rosetta input files(4) sends jobs to the cluster.

The header is take from the fast file(`` > /header / ``) not from the filename of your Fasta file.

I discovered this::

     qstat -xml | tr '\n' ' ' | sed 's#<job_list[^>]*>#\n#g' \
>   | sed 's#<[^>]*>##g' | grep " " | column -t

(https://stackoverflow.com/questions/26104116/qstat-and-long-job-names) so there is now need to shorted my job ids.

Helix
-------------------------------------------------------

Run::

    rna_rosetta_run.py -i -e -r -g -c 200 cp20.fa

`-i`::

    # prepare a folder for a run
    >cp20
    AUUAUCAAGAAUCUCAAAGAGAGAUAGCAACCUGCAAUAACGAGCAAGGUGCUAAAAUAGAUAAGCCAAAUUCAAUUGGAAAAAAUGUUAA
    .(((((....(((((.....)))))(((..(((((..[[[[..)).))).)))......))))).((((......)))).......]]]].

    [peyote2] ~ rna_rosetta_run.py -i cp20.fa
    run rosetta for:
    cp20
    AUUAUCAAGAAUCUCAAAGAGAGAUAGCAACCUGCAAUAACGAGCAAGGUGCUAAAAUAGAUAAGCCAAAUUCAAUUGGAAAAAAUGUUAA
    .(((((....(((((.....)))))(((..(((((..[[[[..)).))).)))......))))).((((......)))).......]]]].
    /home / magnus // cp20 / created
    Seq & ss created

Troubleshooting.

If one of the helices is missing you will get::

    IOError: [Errno 2] No such file or directory: 'helix1.out'
    rosetta_submit.py README_FARFAR o 500 100 taf
    Could not find:  README_FARFAR

and the problem was a1 and g8 pairing::

    outputting command line to:  helix0.RUN  # previous helix #0
    Sequence:  AUGG CCGG
    Secstruc:  (((())))
    Not complementary at positions a1 and g8!
    Sequence:  GUGGG CCCAU
    Secstruc:  ((((()))))

    Writing to fasta file:  helix2.fasta  # next helix #2

My case with a modeling of rp12

    Sequence:  cc gc
    Secstruc:  (())
    Not complementary at positions 1 and 4!

edit the secondary structure, run the program with -i(init, to overwrite seq.fa, ss.fa) and then it works.


Notes::

   rp17hc 6 characters

"""
from __future__ import print_function
import argparse
import textwrap
import os
import glob
import subprocess
import shutil
import math

import os
import sys

try:
    from rna_tools.rna_tools_config import RNA_ROSETTA_RUN_ROOT_DIR_MODELING
except:
    RNA_ROSETTA_RUN_ROOT_DIR_MODELING = ''
    print ('Set up rna_rosetta_run_root_dir_for_modeling in rpt_config_local.py')



[docs]
class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawTextHelpFormatter):
    pass




[docs]
def get_parser():
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=CustomFormatter)

    parser.add_argument('-i', '--init', help='prepare _folder with seq and ss',
                        action='store_true')
    parser.add_argument('-e', '--helices', help='produce h(E)lices',
                        action='store_true')
    parser.add_argument('-r', '--rosetta', help='prepare rosetta files (still you need `go` to send jobs to a cluster)',
                        action='store_true')
    parser.add_argument('-g', '--go', help=go.__doc__,
                        action='store_true')
    parser.add_argument('-m', '--motif', help="include a motif file, e.g. -s E-loop_1q9a_mutated_no_flanks_renumber.pdb")
    parser.add_argument('-n', '--nstruc', help="# of structures you want to get",
                        default=10000, type=int)

    parser.add_argument('-c', '--cpus', help='# of cpus to be used', default=200,
                        type=int)

    parser.add_argument('--sandbox', help="where to run it (default: RNA_ROSETTA_RUN_ROOT_DIR_MODELING",
                        default=RNA_ROSETTA_RUN_ROOT_DIR_MODELING)

    parser.add_argument('file', help=textwrap.dedent(
        """file: \n > a04\nUAUAACAUAUAAUUUUGACAAUAUGGGUCAUAAGUUUCUACCGGAAUACCGUAAAUAUUCUGACUAUGUAUA\n((((.((((...((.((((.......)))).))........(.(((((.......))))).)..))))))))"""))
    return parser




[docs]
def prepare_folder(args, header, seq, ss, path):
    """Make folder for you job, with seq.fa, ss.fa, input file is copied as input.fa to the folder.

    For ss lowercase is needed when used with motifs, otherwise::

        [peyote2] aa20$ rna_rosetta_run.py -r -m E-loop_1q9a_mutated_no_flanks_renumber_for_acy20.pdb ~/aa20.fa
        2019-05-03 21:31:30,842 rpt_config_local.py::<module>::rpt_config_local loading...
        run rosetta for:
        aa20
        UACGUUCAUCAUCCGUUUGGAUGACGGAAGUAAGCGAAAGCUGAAGGAACGCAUG
        ..(((((.((((((....))))))..[.....(((....)))....)))))]...
        rna_denovo_setup.py -fasta seq.fa -secstruct_file ss.fa -cycles 20000 -no_minimize -nstruct 50  -s E-loop_1q9a_mutated_no_flanks_renumber_for_acy20.pdb  -silent helix0.out helix1.out helix2.out -input_silent_res 3-7 47-51 9-14 19-24 33-35 40-42

        Sequence:  UACGUUCAUCAUCCGUUUGGAUGACGGAAGUAAGCGAAAGCUGAAGGAACGCAUG
        Secstruc:  ..(((((.((((((....))))))..[.....(((....)))....)))))]...
        aaguagaag
        AAGUAGAAG
        Traceback (most recent call last):
          File "/home/magnus/opt/rosetta_src_2016.13.58602_bundle/tools/rna_tools/bin//rna_denovo_setup.py", line 170, in <module>
            raise ValueError('The sequence in %s does not match input sequence!!' % pdb)
        ValueError: The sequence in E-loop_1q9a_mutated_no_flanks_renumber_for_acy20.pdb does not match input sequence!!
        rosetta_submit.py README_FARFAR o 200 100 aa20_
        Could not find:  README_FARFAR
    """
    d = path
    try:
        os.mkdir(d)
        print(d, 'created')
    except OSError:
        print(d, 'created is already created')
        pass

    with open(d + "seq.fa", "w") as f:
        f.write(header + '\n')
        f.write(seq)

    with open(d + "ss.fa", "w") as f:
        f.write(ss.lower())
    print('Seq & ss created')
    shutil.copyfile(args.file, d + 'input.fa')




[docs]
def prepare_helices():
    """Make helices(wrapper around 'helix_preassemble_setup.py')

    .. warning:: I think multiprocessing of helixX.run does not work."""
    # run helix_p..
    cmd = 'helix_preassemble_setup.py -secstruct ss.fa -fasta seq.fa'
    os.system(cmd)

    # find all helix
    helix_runs = glob.glob('*RUN')
    print(helix_runs)

    f = open('HRUNS', 'w')
    for h in helix_runs:
        f.write(open(h).read().strip() + ' & \n')
    f.close()

    # does not work (!)
    # os.system('chmod +x CMDLINES')
    # os.system('./CMDLINES')
    # ./CMDLINES: 2: source: not found

    # os.system runs multiprocessing, but does not wait for the rest of the program
    # hmm... it does not wait because I used & ???
    # this works in multirpocessing mode but it does not wait for `-r` !!! so if your run -e only it's OK.
    # don't combile -e with -r because making helices will not wait to run -r (!) and  you will get an error
    # and only helices made and then the program will finish
    if False:
        os.system('bash HRUNS')
    else:
        p = subprocess.Popen(open('HRUNS').read(),
                             shell=True, stderr=subprocess.PIPE)
        p.wait()
        stderr = p.stderr.read().strip()
        if stderr:
            print(stderr)




[docs]
def prepare_rosetta(header, cpus, motif, nstruc):
    """Prepare ROSETTA using rna_denovo_setup.py

    cpus is used to calc nstruc per job to get 10k structures per full run::

    Args:

      nstruc(int): how many structures you want to obtain
      nstruct = int(math.floor(20000 / cpus))
      motif: motif file; e.g., -s E-loop_1q9a_mutated_no_flanks_renumber.pdb
      50 (nstruc) = 10k / 200 (cpus)

    """
    # get list line
    helices = open('CMDLINES').readlines()[-1].replace('#', '')
    njobs = cpus  # 500
    nstruct = int(math.floor(nstruc / cpus))  # 20000/500 -> 40
    if motif:
        cmd_motif = ' -s ' + motif
    else:
        cmd_motif = ''
    cmd = 'rna_denovo_setup.py -fasta seq.fa -secstruct_file ss.fa -cycles 20000 -no_minimize -nstruct ' + \
        str(nstruct) + ' ' + cmd_motif + ' ' + helices
    print(cmd)
    os.system(cmd)
    # change to 50 per job (!)
    # 50 * 100 = 10k ?
    # dont change this 100 (!) it might not run on peyote2 with values like 99999 !
    # cmd is
    # rna_tools/bin//rosetta_submit.py <text file with rosetta command> <outdir> <# jobs> <# hours> <job name>
    # manually: [peyote2] a22$ rosetta_submit.py README_FARFAR o 200 100 a22_
    cmd = 'rosetta_submit.py README_FARFAR o ' + \
        str(njobs) + ' 100 ' + header + '_'
    print(cmd)
    os.system(cmd)




[docs]
def go():
    """send jobs to a cluster(run qsubMINI)"""
    os.system('chmod +x ./qsubMINI')
    os.system('./qsubMINI')




[docs]
def main():
    """Pipeline for modeling RNA"""
    args = get_parser().parse_args()

    if args.file:
        f = open(args.file)
        header = f.readline().replace('>', '').replace(' ', '').strip()
        seq = f.readline().strip()
        ss = f.readline().strip()
        cpus = int(args.cpus)
        print('run rosetta for:')
        print(header)

        print(seq)
        print(ss)

        if RNA_ROSETTA_RUN_ROOT_DIR_MODELING.strip() == '':
            print('Set RNA_ROSETTA_RUN_ROOT_DIR_MODELING in your rpt_config file.')
            return

        path = args.sandbox + os.sep + header + \
            os.sep  # RNA_ROSETTA_RUN_ROOT_DIR_MODELING
        curr = os.getcwd()
        if args.init:
            prepare_folder(args, header, seq, ss, path)
        try:
            os.chdir(path)
        except OSError:
            print('You have to make a folder first! use --init')
            sys.exit(1)

        if args.helices:
            prepare_helices()
        if args.rosetta:
            prepare_rosetta(header, cpus, args.motif, args.nstruc)
        if args.go:
            go()

        os.chdir(curr)



# main
if __name__ == '__main__':
    main()