See attached Perl program. If you can run a Perl program, it does exactly what you need, and a few more things too. Craig
On Thu, Jan 31, 2019 at 12:03 PM gehtalles <toz...@t-online.de> wrote: > I got a massive SDF file with some 300k structures I need to split into > smaller files > > I tried > > babel infile.sdf -isdf -osdf -f 1 -l 1000 -O outfile.sdf > > but all meta data defined in > > > <data name> > data content > > are discarded in the outfile. > > Is there an option I missed to include them in the new file? > > > > -- > Sent from: http://forums.openbabel.org/General-discussion-f3090658.html > > > _______________________________________________ > OpenBabel-discuss mailing list > OpenBabel-discuss@lists.sourceforge.net > https://lists.sourceforge.net/lists/listinfo/openbabel-discuss > -- --------------------------------- Craig A. James Chief Technology Officer eMolecules, Inc. ---------------------------------
#!/usr/bin/perl
#======================================================================
# FILE:         split_sdf.pl
# AUTHOR:       Craig A. James
# DESCRIPTION:
#       Splits an MDL SD File into manageable-sized chunks.
#
#       Records are delimited by a line starting with "$$$$".  Every
#       line of a record, including its data items, is copied verbatim
#       to the current output file.
#
#======================================================================
# Copyright (c) 2006-2009, eMolecules Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation version 2 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#======================================================================

use strict;
use warnings;

$| = 1;    # unbuffered STDOUT so progress messages appear as they happen

#----------------------------------------------------------------------
# usage([$message])
#   Prints the optional error message followed by the help text, then
#   exits.  Exit status is 1 when a message was given (an error) and 0
#   otherwise.  The "Hit [Return]" pause is only done when STDIN is a
#   terminal, so the script never blocks on (or eats data from) a pipe.
#----------------------------------------------------------------------
sub usage {
    my ($message) = @_;
    print "$message\n" if defined($message);
    print <<EOF;
Usage: split_sdf.pl [N] [-skip N] [-do N] [-random N/M] [-zap_extra_lf] [file.sdf]
   N is number of SD records per file (default 50000); if given, it
   must be the first argument
   -skip N         Skip N records before starting output
   -do N           Do N records and then quit
   -random N/M     Randomly only print approximately N/M of the total
   -zap_extra_lf   Zap extra linefeed between records

Options may be given in any order, but must come before the file name.
If no file name (or "-") is given, records are read from standard input
and the output files are named sdfile_0001.sdf, sdfile_0002.sdf, ...

Files will be named with numeric suffix, e.g.  "split_sdf.pl foo.sdf"
will generate files like these:
   foo_0001.sdf
   foo_0002.sdf
   ... etc.
EOF
    if (-t STDIN) {
        print "Hit [Return] to exit...";
        my $line = <STDIN>;
    }
    exit(defined($message) ? 1 : 0);
}

#----------------------------------------------------------------------
# count_arg($option)
#   Removes the next element of @ARGV and returns it as an integer.
#   Calls usage() if the value is missing or is not a non-negative
#   integer.
#----------------------------------------------------------------------
sub count_arg {
    my ($option) = @_;
    my $value = shift @ARGV;
    usage("Option $option requires a non-negative integer")
        unless defined($value) && $value =~ m/^[0-9]+$/;
    return int($value);
}

#----------------------------------------------------------------------
# skip_this_record($keep_n, $out_of_m)
#   Random sampling decision for one record.  Returns 1 if the record
#   should be dropped, 0 if it should be kept.  Always returns 0 when
#   $keep_n is 0 (sampling disabled).
#----------------------------------------------------------------------
sub skip_this_record {
    my ($keep_n, $out_of_m) = @_;
    return 0 unless $keep_n > 0;
    my $r = int(rand($out_of_m) + 1.0);    # uniform integer in 1..M
    return ($r > $keep_n) ? 1 : 0;
}

#----------------------------------------------------------------------
# Command line
#----------------------------------------------------------------------

# Records per output file; only recognised as the first argument.
my $n = 50000;
if (@ARGV && $ARGV[0] =~ m/^[0-9]+$/) {
    $n = int(shift @ARGV);
}
usage("N (records per file) must be greater than zero") if $n < 1;

my $skip         = 0;    # records still to be skipped before output starts
my $dorecords    = 0;    # records still to be written; 0 means "no limit"
my $rand_n       = 0;    # keep roughly $rand_n out of every $rand_m records;
my $rand_m       = 0;    #   $rand_n == 0 disables random sampling
my $zap_extra_lf = 0;

# Options are accepted in any order.  The first argument that is not a
# known option ends option parsing and is treated as the file name.
OPTION: while (@ARGV) {
    my $opt = $ARGV[0];
    if ($opt eq "-skip") {
        shift @ARGV;
        $skip = count_arg("-skip");
    }
    elsif ($opt eq "-do") {
        shift @ARGV;
        $dorecords = count_arg("-do");
    }
    elsif ($opt eq "-random") {
        shift @ARGV;
        my $ratio = shift @ARGV;
        if (defined($ratio) && $ratio =~ m{^([0-9]+)/([0-9]+)$}) {
            ($rand_n, $rand_m) = (int($1), int($2));
        }
        else {
            usage("Option -random requires a ratio of the form N/M");
        }
        $rand_n = 0 unless $rand_n > 0 && $rand_m > 0;
        print "Random $rand_n/$rand_m\n";
    }
    elsif ($opt eq "-zap_extra_lf") {
        shift @ARGV;
        $zap_extra_lf = 1;
    }
    elsif ($opt eq "-h" || $opt eq "-help" || $opt eq "--help") {
        usage();
    }
    else {
        last OPTION;
    }
}

# Input file and the pieces used to build the output file names.
my ($basename, $filename, $suffix);
if (@ARGV && $ARGV[0] ne "-") {
    $filename = shift @ARGV;
    usage("Can't read file '$filename'") if !-r $filename;
    if ($filename =~ m/^(.*)\.([a-zA-Z]+)$/) {
        ($basename, $suffix) = ($1, $2);
    }
    else {
        # No alphabetic extension: keep the whole name as the base and
        # fall back to "sdf" instead of reusing the file name as suffix.
        $basename = $filename;
        $basename =~ s/\.$//;
        $suffix = "sdf";
    }
}
else {    # no filename given?
    shift @ARGV if @ARGV;    # drop the explicit "-"
    $filename = "-";         # use standard input
    $basename = "sdfile";
    $suffix   = "sdf";
}
warn "Ignoring extra arguments: @ARGV\n" if @ARGV;

#----------------------------------------------------------------------
# Main loop
#----------------------------------------------------------------------

my $in;
if ($filename eq "-") {
    $in = \*STDIN;
}
else {
    open($in, '<', $filename)
        or die("Failed to open '$filename' for reading: $!");
}

if ($dorecords > 0) {
    print("Note: output limited to $dorecords records\n");
}
if ($skip > 0) {
    print("Skipping $skip records...\n");
}

my $filesuffix = 1;    # number of the next output file
my $count      = 0;    # records written to the current output file
my $total      = 0;    # records written to all output files
my $out;               # current output handle; undef while no file is open
my $outfile;           # name of the current output file
my @pending;           # lines read ahead by -zap_extra_lf, not yet processed

my $skip_record = skip_this_record($rand_n, $rand_m);

SDREC: while (1) {
    my $line = @pending ? shift(@pending) : readline($in);
    last SDREC unless defined($line);

    if ($skip == 0 && !$skip_record) {
        # Output files are opened only when there is something to put in
        # them, so an input that ends exactly on a chunk boundary does
        # not leave an empty file behind.
        if (!defined($out)) {
            $outfile = sprintf("%s_%04d.%s", $basename, $filesuffix, $suffix);
            $filesuffix++;
            open($out, '>', $outfile)
                or die("Couldn't open '$outfile' for writing: $!");
            print "Writing file '$outfile' ... ";
        }
        print {$out} $line
            or die("Error writing to '$outfile': $!");
    }

    next SDREC unless $line =~ m/^\$\$\$\$/;

    # --- End of a record ---
    if ($skip_record) {
        # Dropped by -random: counts neither as skipped nor as written.
    }
    elsif ($skip > 0) {
        $skip--;
    }
    else {
        $count++;
        $total++;

        if ($count % $n == 0) {
            close($out) or die("Error closing '$outfile': $!");
            $out = undef;
            print "$count records\n";
            $count = 0;
        }

        if ($dorecords > 0) {
            $dorecords--;
            last SDREC if $dorecords == 0;
        }
    }

    if ($zap_extra_lf) {
        # This is kind of hokey.  Do a "read ahead" of five lines, and
        # try to determine whether there's an extra line or not.  If
        # there is, discard it.  The lines that are left are pushed back
        # so they go through the normal skip/write logic above.
        my @ahead;
        while (@ahead < 5) {
            my $next = @pending ? shift(@pending) : readline($in);
            last unless defined($next);
            push @ahead, $next;
        }
        if (   @ahead == 5
            && $ahead[0] =~ m/^\n/
            && $ahead[4] =~ m/[ 0-9][ 0-9][0-9][ 0-9][ 0-9][0-9]/)
        {
            shift @ahead;
        }
        unshift @pending, @ahead;
    }

    $skip_record = skip_this_record($rand_n, $rand_m);
}

if (defined($out)) {
    print "$count records\n";
    close($out) or die("Error closing '$outfile': $!");
}
close($in) if $filename ne "-";

print "Done: $total records total.\n";
_______________________________________________ OpenBabel-discuss mailing list OpenBabel-discuss@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/openbabel-discuss