Source code for rna_tools.tools.rna_rosetta.rna_rosetta_cluster

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""rna_rosetta_cluster.py - cluster silent Rosetta files

A wrapper to ROSETTA tools for RNA modeling

Based on C. Y. Cheng, F. C. Chou, and R. Das, Modeling complex RNA tertiary folds with Rosetta, 1st ed., vol. 553. Elsevier Inc., 2015.
http://www.sciencedirect.com/science/article/pii/S0076687914000524

::

    rna_rosetta_cluster.py ade_min.out 20000

Take n * 0.005 (.5%) of all frames and put them into `selected.out`. Then the tool clusters this `selected.out`.
"""
from __future__ import print_function
import platform
import argparse
import os
import subprocess
import math
import logging
import shutil
import time
#limit_clusters = 1  # if you want to change this, then rewrite procedure of stopping!

logging.basicConfig(level=logging.INFO)


[docs] def get_parser(): parser = argparse.ArgumentParser( description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument('--no_select', action='store_true', help="Don't run selection once again. Use selected.out in the current folder") parser.add_argument('--radius-inc-step', type=float, default=0.5, help="radius incremental step, default 0.5") parser.add_argument('--limit-clusters', default=5, help="# of clusters") parser.add_argument('file', help='ade.out') parser.add_argument('n', type=int, help='# of total structures') return parser
[docs] def get_no_structures_in_first_cluster(fn): """Get # of structures in a silent file. Args: fn (string): a filename to a silent file Returns int: # of structures in a silent file""" p = subprocess.Popen('grep SCORE ' + fn + ' | grep c.0 | wc -l', shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) p.wait() stderr = p.stderr.read().strip() if stderr: print(stderr) return int(p.stdout.read().strip())
[docs] def get_selected(file, nc): """Get selected for clustering""" cmd = 'silent_file_sort_and_select.py ' + file + ' -select 1-' + str(nc) + ' -o selected.out' logging.info(cmd) p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) p.wait() stderr = p.stderr.read().strip() if stderr: print(stderr)
[docs] def cluster(radius, limit_clusters): """Internal function of cluster_loop: It removes cluster.out first.""" try: os.remove("cluster.out") except OSError: pass if platform.system() == "Darwin": cmd = 'cluster.macosclangrelease' if platform.system() == "Linux": cmd = "cluster.default.linuxgccrelease" cmd += " -in:file:silent selected.out -in:file:fullatom -out:file:silent_struct_type binary -export_only_low false -out:file:silent cluster.out -cluster:limit_clusters " + str(limit_clusters) + " -cluster:radius " + str(radius) #'-out:prefix ade_ "#| tee clustering.out" logging.info(cmd) os.system(cmd)
#p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) # p.wait() #stderr = p.stderr.read().strip() # if stderr: # print stderr
[docs] def extract(): #cmd = "~/src/rna-pdb-tools/rna_tools/utils/rna_rosetta/rna_extract_lowscore_decoys.py cluster0.out 5" if platform.system() == "Darwin": cmd = 'extract_pdbs.default.macosclangrelease -in::file::silent cluster.out' if platform.system() == "Linux": cmd = 'extract_pdbs.default.linuxgccrelease -in::file::silent cluster.out' os.system(cmd)
[docs] def cluster_loop(ns, radius_inc_step, limit_clusters): """Go from radius 1 to get 1/6 of structures of ns (# of selected structures) in the first cluster, then it stops.""" radius = 1 # should be 1 while 1: cluster(radius, limit_clusters) ns1 = get_no_structures_in_first_cluster('cluster.out') print('%i in #1 cluster %i from the total number of structures' % (ns1, ns)) if ns1 > ns * .16: # 1/6 break radius += radius_inc_step logging.info('radius %i ; %i in #1 cluster %i from the total number of structures' % (radius, ns1, ns)) return radius, ns1
[docs] def get_no_structures(file): p = subprocess.Popen('cat ' + file + ' | grep SCORE | wc -l', shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stderr = p.stderr.read().strip() if stderr: print(stderr) return int(p.stdout.read().strip()) - 1
[docs] def run(): """Pipline for modeling RNA""" args = get_parser().parse_args() logging.info(time.strftime("%Y-%m-%d %H:%M")) logging.info(args) # get_no_structures(args.file) # if you mini then # is the total number of structures n = args.n nc = int(math.ceil(n * 0.005)) # nc no for clustring print('# total:', n, ' # selected:', nc) # selection if not args.no_select: get_selected(args.file, nc) ns = get_no_structures('selected.out') print('# selected:', ns) ns = get_no_structures('selected.out') radius, ns1 = cluster_loop(ns, args.radius_inc_step, args.limit_clusters) # loop over selected to get 1/6 in the biggest cluster extract() print("# of structures in the biggest cluster", ns1) logging.info('radius %f, step %f ; %i in #1 cluster %i from the total number of structures' % (radius, args.radius_inc_step, ns1, ns))
# main if __name__ == '__main__': run()