See attached Perl program. If you can run a Perl program, it does exactly
what you need, and a few more things too.
Craig

On Thu, Jan 31, 2019 at 12:03 PM gehtalles <toz...@t-online.de> wrote:

> I got a massive SDF file with some 300k structures I need to split into
> smaller files
>
> I tried
>
> babel infile.sdf -isdf -osdf -f 1 -l 1000 -O outfile.sdf
>
> but all meta data defined in
>
> > <data name>
> data content
>
> are discarded in the outfile.
>
> Is there an option I missed to include them in the new file?
>
>
>
> --
> Sent from: http://forums.openbabel.org/General-discussion-f3090658.html
>
>
> _______________________________________________
> OpenBabel-discuss mailing list
> OpenBabel-discuss@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/openbabel-discuss
>


-- 
---------------------------------
Craig A. James
Chief Technology Officer
eMolecules, Inc.
---------------------------------
#!/usr/bin/perl
#======================================================================
# FILE:		split_sdf.pl
# AUTHOR:	Craig A. James
# DESCRIPTION:
#	Splits an MDL SD File into manageable-sized chunks.  
#	
#======================================================================	
# Copyright (c) 2006-2009, eMolecules Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation version 2 of the License.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#======================================================================

use strict;

$| = 1;

# usage([MESSAGE])
#   Prints MESSAGE (when one is given) followed by the help text, then
#   terminates the program.  The exit status is 1 when a MESSAGE was
#   supplied (i.e. the call reports an error) and 0 otherwise.
#   When standard input is a terminal, waits for [Return] first so the
#   text stays readable in a console window that closes on exit.
sub usage {
    my ($message) = @_;

    print "$message\n" if defined($message);
    print <<EOF;
Usage: split_sdf.pl [N] [-skip N] [-do N] [-random N/M] [-zap_extra_lf] [file.sdf]
  N is number of SD records per file (default 50000)
  -skip N        Skip N records before starting output
  -do N          Do N records and then quit
  -random N/M    Randomly only print approximately N/M of the total
  -zap_extra_lf  Zap extra linefeed between records

  Files will be named with a zero-padded numeric suffix, e.g.
  "split_sdf.pl foo.sdf" will generate files like these:
    foo_0001.sdf
    foo_0002.sdf
    ... etc.

  If no file (or "-") is given, records are read from standard input
  and the output files are named sdfile_0001.sdf, sdfile_0002.sdf, etc.

EOF

    # Only prompt when a person can actually answer; with piped or
    # redirected input the read would swallow data or block needlessly.
    if (-t STDIN) {
        print "Hit [Return] to exit...";
        my $line = <STDIN>;
    }
    exit(defined($message) ? 1 : 0);
}


# Command-line options.  An optional leading integer is the number of
# records per output file; the dash options that follow may be given in
# any order.  Whatever is left in @ARGV afterwards is the input file.

my $n = 50000;			# records per output file
if (defined($ARGV[0]) && $ARGV[0] =~ m/^[0-9]+$/) {
    $n = shift @ARGV;
    # The splitting loop computes "$count % $n"; zero would be fatal there.
    usage("Number of records per file must be at least 1") if $n < 1;
}

my $skip = 0;			# records to pass over before output starts
my $dorecords = 0;		# stop after this many records (0 = no limit)
my ($rand_n, $rand_m);		# keep roughly $rand_n out of $rand_m records
my $zap_extra_lf = 0;		# drop a stray blank line between records

OPTION:
while (defined($ARGV[0])) {
    my $option = $ARGV[0];

    if ($option eq "-skip" || $option eq "-do") {
        shift @ARGV;
        my $value = shift @ARGV;
        usage("Option '$option' requires a non-negative integer")
            unless defined($value) && $value =~ m/^[0-9]+$/;
        if ($option eq "-skip") {
            $skip = int($value);
        }
        else {
            $dorecords = int($value);
        }
    }
    elsif ($option eq "-random") {
        shift @ARGV;
        my $fraction = shift @ARGV;
        usage("Option '-random' requires a value of the form N/M")
            unless defined($fraction) && $fraction =~ m{^[0-9]+/[0-9]+$};
        ($rand_n, $rand_m) = map { int($_) } split(/\//, $fraction);
        # A zero numerator or denominator switches random sampling off.
        $rand_n = 0 unless $rand_n > 0 && $rand_m > 0;
        print "Random $rand_n/$rand_m\n";
    }
    elsif ($option eq "-zap_extra_lf") {
        shift @ARGV;
        $zap_extra_lf = 1;
    }
    elsif ($option eq "-h" || $option eq "-help" || $option eq "--help") {
        usage();
    }
    elsif ($option =~ m/^-./ && ! -e $option) {
        # Looks like an option, and is not an existing file whose name
        # happens to start with a dash.
        usage("Unknown option '$option'");
    }
    else {
        last OPTION;		# input file name, or "-" for standard input
    }
}

# Work out where to read from and how to name the output files.
# Output files are named "<basename>_<number>.<suffix>".
my ($basename, $filename, $suffix);

if (defined($ARGV[0]) && $ARGV[0] ne "-") {
    $filename = shift @ARGV;
    usage("Can't read file '$filename'") if ! -r $filename;

    # Split "name.ext" at the last dot of the final path component.
    # The extension may contain digits (e.g. "mol2"), but a dot inside
    # a directory name or a leading dot ("dir.d/file", ".hidden") is
    # not treated as an extension separator.
    if ($filename =~ m{^(.*[^/\\])\.([^./\\]+)\z}s) {
        ($basename, $suffix) = ($1, $2);
    }
    else {			# no extension at all?
        $basename = $filename;	# keep the whole name
        $suffix = "sdf";	# and fall back to the default suffix
    }
}
else {				# no filename given?
    $filename = "-";		# use standard input
    $basename = "sdfile";
    $suffix = "sdf";
}

# Main pass: copy records (each terminated by a line starting with
# "$$$$") from the input into numbered output files holding at most $n
# records each, honouring -skip, -do, -random and -zap_extra_lf.

my $filesuffix = 0;		# number of output files opened so far
my $count = 0;			# records written to the current output file
my $total = 0;			# records written to all output files

my $in;
if ($filename eq "-") {
    $in = \*STDIN;
}
else {
    open($in, "<", $filename)
        || die("Failed to open '$filename' for reading: $!");
}

if ($dorecords > 0) {
    print("Note: output limited to $dorecords records\n");
}
if ($skip > 0) {
    print("Skipping $skip records...\n");
}

my ($out, $outfile);		# current output handle and its file name

# Write lines to the current output file.  The file is only opened once
# there is something to put into it, so an input whose record count is
# an exact multiple of $n does not leave an empty file behind.
my $write_lines = sub {
    my @lines = @_;
    if (!defined($out)) {
        $filesuffix++;
        $outfile = sprintf("%s_%04d.%s", $basename, $filesuffix, $suffix);
        open($out, ">", $outfile)
            || die("Couldn't open '$outfile' for writing: $!");
        print "Writing file '$outfile' ... ";
    }
    print {$out} @lines
        or die("Couldn't write to '$outfile': $!");
};

# Close the current output file (if any) and report how many records
# went into it.  Buffered write errors surface at close, so check it.
my $close_output = sub {
    return unless defined($out);
    close($out) || die("Couldn't finish writing '$outfile': $!");
    undef $out;
    print "$count records\n";
    $count = 0;
};

# Lines fetched ahead of time by -zap_extra_lf are pushed back here so
# that they pass through the normal per-line processing below.
my @pending;
my $read_line = sub {
    return shift(@pending) if @pending;
    return scalar(<$in>);
};

# -random N/M: draw an integer in 1..M and drop the record when it is
# greater than N.  Never drops anything unless -random is in effect.
my $drop_by_random = sub {
    return 0 unless defined($rand_n) && $rand_n > 0;
    return int(rand($rand_m) + 1.0) > $rand_n;
};

my $skip_record = $drop_by_random->();
my @record;			# lines of the record being collected

SDREC:
while (defined(my $line = $read_line->())) {
    push(@record, $line) unless $skip > 0 || $skip_record;
    next SDREC unless $line =~ m/^\$\$\$\$/;

    # End of a record.  Records dropped by -random are neither written
    # nor counted against -skip.
    if (!$skip_record) {
        if ($skip > 0) {
            $skip--;
        }
        else {
            $write_lines->(@record);
            $count++;
            $total++;
            $close_output->() if $count >= $n;
            if ($dorecords > 0 && --$dorecords == 0) {
                @record = ();
                last SDREC;
            }
        }
    }
    @record = ();

    if ($zap_extra_lf) {

        # This is kind of hokey.  Do a "read ahead" of five lines, and
        # try to determine whether there's an extra line or not: a
        # blank first line combined with what looks like a counts line
        # in fifth position.  If there is, discard the blank line.  The
        # remaining lines are pushed back rather than written directly,
        # so they are still subject to -skip/-random and to the "$$$$"
        # check.
        my @ahead;
        while (@ahead < 5) {
            my $next = $read_line->();
            last unless defined($next);
            push(@ahead, $next);
        }
        if (   @ahead == 5
            && $ahead[0] =~ m/^\r?\n/
            && $ahead[4] =~ m/[ 0-9][ 0-9][0-9][ 0-9][ 0-9][0-9]/) {
            shift @ahead;
        }
        unshift(@pending, @ahead);
    }

    $skip_record = $drop_by_random->();
}

# Data after the last "$$$$" is passed through without being counted as
# a record.  Whitespace-only leftovers (trailing blank lines) are
# dropped so they cannot create an otherwise empty output file.
if (grep { m/\S/ } @record) {
    $write_lines->(@record);
}
$close_output->();
close($in) if $filename ne "-";
print "Done: $total records total.\n";
_______________________________________________
OpenBabel-discuss mailing list
OpenBabel-discuss@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/openbabel-discuss

Reply via email to