#!/usr/local/bin/perl 

################################################################################
#Libraries:

require 5.000 ;	# This is a perl 5.0 script
require MDL_sdf ;


use Getopt::Long ;
use File::Basename ;
# Program name (directory and ".perl" suffix removed); used in the messages.
$me = basename( $0, ".perl") ;

# Parse the command line. The value of each option NAME ends up in the
# global variable $opt_NAME, which the rest of the script tests.
# -perl_file is an alias of -perlfile: it used to be accepted but was never
# read, so giving it had no effect.
# GetOptions returns false when the command line cannot be parsed (unknown
# option, bad value); stop here instead of running with a partial selection.
GetOptions('help', 'start:n' , 'end:n' , 'seqfile:s', 'labelfile:s', 'property_name:s', 'not', 'allow_duplicate', 'perlfile|perl_file:s')
	|| die "Invalid command line.\nTry   $me -h\n" ;

# -help: print the usage text on STDERR (through die) and stop.
# The text is a qq{} string, so every '$' that must appear literally in the
# output (the record separator and the perl example) is written as '\$', and
# a literal backslash-n inside the example is written as '\\n'.
if( defined($opt_help)  ) { 
	die( 
qq{Usage: $me [-start integer -end integer] [-seqfile file] [-labelfile file [-property_name prop] [-allow_duplicate]] [-perlfile file.pm] [-not] < input.sdf > output.sdf

$me read a MDL SDFILE data file, select all records between start and end and output a MDL SDFILE.

Record separator is \$\$\$\$

Options:
	-seqfile
The list of record numbers to be selected can also be loaded from a file
with the option -seqfile.
An example of this file's content (numbers separated by newline and/or blanks ) :

2 4 567
34 566778

	-labelfile
Another way to select records is to filter them by labels (the first line of
each sdf record, or the value of the property specified by the -property_name option, see below). The argument of the option -labelfile is the name of the file
that contains the labels , e.g.:
227832
3876
99938763
...

The order of the labels in this file is not important. In the present version, the labels may not contain blank characters.

Alternatively, one can give a property name to match the labels.
If the SDF contains property lines like this:
> <NSC>
118179

the argument to the option -property_name is NSC 
.

-allow_duplicate : with -labelfile, do not stop when a label occurs more than once in the label file or in the SDF input.


-not : inverse the selection

-perlfile : the argument is a file name containing some perl code that defines a new perl function.
By convention, the function name is is_sdf_record_kept .

 This function receives two arguments:
	1) pointer to the sdf entry
    2) record number
Note that values for command-line options are available as global variables (e.g. \$opt_property_name stands for the -property_name value).

 The function must return 1 or 0. When the return value is non zero, then the SDF entry will be output, otherwise it will be skipped. Use the -not option to inverse the selection.

Example:
##################################
sub is_sdf_record_kept
{
	my \$sdf_entry = shift ;    #retrieve first argument: pointer to the current sdf entry
	my \$record_number = shift ; #retrieve second argument: record number
	
	#Check the validity of the first argument
	defined \$sdf_entry || die "Assertion failed" ;
	
	
	#Retrieve the value for the key "NSC" from the SDF properties
	# of this entry
	my \$value = \$sdf_entry->data_for_field_name("NSC");
	
	#The data_for_field_name() function returns an undefined value
	#if the properties has not been found.
	#In this example, all entries from the NCI database must have an NSC field
	defined \$value || die "Fatal error: undefined NSC" ;


	printf STDERR "rec=%d NSC=%s\\n", \$record_number, \$value;
	
	return \$value < 900000 ; #Keep NSC's < 900000
}
1;
##################################
\n}) ;

}

#############################################################################
#Check the options


# Exactly one selection mode may be used per run:
#   - a record range (both -start and -end given),
#   - -seqfile,
#   - -labelfile,
#   - -perlfile.
# Counting the modes also rejects a range combined with -perlfile, which
# used to be accepted while the perl file was silently ignored.
my $is_range_selected = ( defined $opt_start && defined $opt_end ) ? 1 : 0 ;

my $number_of_selection_modes =
	  $is_range_selected
	+ ( defined($opt_seqfile)   ? 1 : 0 )
	+ ( defined($opt_labelfile) ? 1 : 0 )
	+ ( defined($opt_perlfile)  ? 1 : 0 ) ;

if( $number_of_selection_modes > 1 )
{
	die "Do not mix options\n" ;
}



#############################################################################
#Not option
# $Not is 1 when -not was given on the command line, 0 otherwise.
# It is read by bool_not_test() and by the selection loops below.
my $Not = ( defined $opt_not ) ? 1 : 0 ;

# Apply the -not option to a selection test.
# Argument: the result of the test (any perl true/false value).
# Returns the argument unchanged when $Not is false, and its logical
# negation when $Not is true.
sub bool_not_test
{
	my ($test_result) = @_ ;

	return $Not ? ! $test_result : $test_result ;
}
#############################################################################
# Select from range

# Record range mode: write the records whose number (1 for the first record
# of the input) lies between -start and -end inclusive, or all the other
# records when -not is given. Ends the program.
if( defined $opt_end && defined $opt_start)
{
	$opt_end >= $opt_start || die "$opt_end  must be >= $opt_start" ;
	my $current_record = 0 ;

	FOR_EACH_SDF_ENTRY: while( 1 )
	{ 
		my $sdf_entry = MDL_sdf_non_parsed_molecule->readFromInput() ; 
	
		defined $sdf_entry || last FOR_EACH_SDF_ENTRY ;  #end of the input
		$current_record ++ ;
		
		if( bool_not_test(
			$current_record >= $opt_start  
			&& $current_record <= $opt_end))
		{
			$sdf_entry->write();
		}
	
		# Without -not nothing can be selected after record number -end,
		# so stop as soon as that record has been handled instead of
		# reading one more record first.
		if( (! $Not) && $current_record >= $opt_end ) {
			last FOR_EACH_SDF_ENTRY;
		}
	}
	
	exit() ;
}

#############################################################################
# Select from sequence of record numbers

# -seqfile mode: write the records whose number (1 for the first record of
# the input) is listed in the file $opt_seqfile, or all the other records
# when -not is given. A "Record= ... Label= ..." line is printed on STDERR
# for every written record. Ends the program.
if( defined $opt_seqfile )
{
	# Requested record numbers. A hash is used so that a large record
	# number does not allocate a huge array and so that a number listed
	# twice is counted once.
	my %keep = () ;
	my $numbers_of_written_records = 0 ;
		
	open( FILE,  $opt_seqfile)  || die "Can't open $opt_seqfile for reading\n" ;
	while(<FILE> )
	{
		# split ' ' skips leading blanks, so blank lines and indented
		# lines do not produce an empty first field.
		foreach my $number ( split ' ', $_ )
		{
			$number =~ /^\d+$/
				|| die "Invalid record number \"$number\" in the file $opt_seqfile\n" ;
			$keep{ $number + 0 } = 1 ;	# + 0 : "007" and "7" are the same record
		}
	}
	close FILE ;

	# Number of distinct requested records not written yet; used to stop
	# reading early and for the final consistency check (not with -not).
	my $numbers_of_remaining_records = scalar( keys %keep ) ;

	my $current_record = 0 ;

	FOR_EACH_SDF_ENTRY: while( 1 )
	{ 
		if( ! $Not) {
			$numbers_of_remaining_records > 0  || last FOR_EACH_SDF_ENTRY ;
		}
		
		my $sdf_entry = MDL_sdf_non_parsed_molecule->readFromInput() ; 
		defined $sdf_entry || last FOR_EACH_SDF_ENTRY ;  #end of the input
		
		$current_record ++ ;
			
		if( bool_not_test($keep{$current_record}))
		{	
			# The label is the first line of the record, as in the
			# -labelfile mode (it used to be printed from a variable
			# that was never set).
			my $label = $sdf_entry->line1() ;
			printf STDERR "Record= %d   Label= %s\n", $current_record, $label ;
			$sdf_entry -> write();
			$numbers_of_written_records ++ ;
			if( ! $Not)
			{	$numbers_of_remaining_records -- ;	}			
			
		}
	 
	}
	
	if(! $Not &&  $numbers_of_remaining_records != 0 )
	{
		warn "Something went wrong. At the end of the input file, there is
still $numbers_of_remaining_records  records left to be read.\n" ;
	}
	
	printf STDERR  "The number of written records is %d.\n", 
			$numbers_of_written_records ;
				
	exit() ;
}

#############################################################################
# Select from a sequence of labels

# -labelfile mode: write the records whose label is listed in the file
# $opt_labelfile, or all the other records when -not is given.
# The label of a record is the value of the property $opt_property_name when
# -property_name is given, otherwise the first line of the record.
# A "Record= ... Label= ..." line is printed on STDERR for every written
# record. Ends the program.
if( defined $opt_labelfile )
{
	my @labels = () ;
	# Requested label => number of records written for this label.
	# Only labels read from the label file are ever stored here.
	my %labels_dict ;
	my $numbers_of_written_records = 0 ;
		
	open( FILE,  $opt_labelfile ) || die "Can't open $opt_labelfile for reading\n" ;
	while(<FILE> )
	{
		# split ' ' skips leading blanks, so blank lines and indented
		# lines do not produce an empty label.
		push @labels, split( ' ', $_ ) ;
	}
	close FILE ;
	
	foreach my $lab ( @labels)
	{
		if( exists $labels_dict{$lab} )
		{
			warn "Duplicate label $lab in the file $opt_labelfile\n" ;
			defined $opt_allow_duplicate || die ;
		}
		$labels_dict{$lab} = 0 ;
	}

	# Number of distinct requested labels not found yet; used to stop
	# reading early and for the final report (not with -not).
	my $numbers_of_remaining_records = scalar( keys %labels_dict ) ;

	my $current_record = 0 ;

	FOR_EACH_SDF_ENTRY: while( 1 )
	{ 
		# Stopping early is only safe when each label can match at most
		# one record.
		if( (! defined $opt_allow_duplicate) && (! $Not) )
		{
			$numbers_of_remaining_records > 0  || last FOR_EACH_SDF_ENTRY ;
		}
		
		my $sdf_entry = MDL_sdf_non_parsed_molecule->readFromInput() ; 
		defined $sdf_entry || last FOR_EACH_SDF_ENTRY ;  #end of the input
		
		$current_record ++ ;
		
		my $label ;
		if( defined $opt_property_name)
		{
			$label = $sdf_entry->data_for_field_name($opt_property_name) ;
			defined $label || die "Undefined label for the property $opt_property_name for record $current_record" ;
		
		} else
		{
			$label = $sdf_entry->line1() ;
		}
		
		my $is_listed = exists $labels_dict{$label} ;
		
		if( bool_not_test($is_listed) )
		{
			# The bookkeeping is done for listed labels only. With -not
			# the written records carry unlisted labels; storing those in
			# %labels_dict made the next record with the same label look
			# listed, so it was dropped.
			if( $is_listed )
			{
				if( $labels_dict{$label} != 0 )
				{
					defined $opt_allow_duplicate || die "The label \"$label\" has been found twice in the SDF input file";
				}
				else
				{
					# first record found for this label
					$numbers_of_remaining_records -- ;
				}
				$labels_dict{$label} ++ ;
			}
			
			$sdf_entry->write() ;
			$numbers_of_written_records ++ ;
			printf STDERR "Record= %d   Label= %s\n", 
					$current_record, $label ;
		}
 
	}
	
	if( ! $Not && $numbers_of_remaining_records != 0 )
	{
		warn "Something went wrong. At the end of the input file, there is (are)
still $numbers_of_remaining_records  record(s) left to be read.\n" ;
		
		warn "The not found labels are :\n";
		foreach my $missing_label (keys %labels_dict)
		{
			if( $labels_dict{$missing_label} == 0)
			{
				warn "\t$missing_label\n" ;
			}
		}
	}
	
	printf STDERR  "The number of written records is %d.\n", 
			$numbers_of_written_records ;
			
	exit() ;
}
#############################################################################
# Select from a user defined procedure
# This function receives two arguments:
#	 1) pointer to the sdf entry
#    2) record number
#See the help for an example
# Default version of the selection function: it always dies.
# The file given with -perlfile is expected to define a function with the
# same name; that file is loaded with require before the function is called.
sub is_sdf_record_kept
{
	my $message = "The function is_sdf_record_kept is not defined" ;
	die $message ;
}

# -perlfile mode: load the perl code of the file $opt_perlfile, then call
# is_sdf_record_kept( entry, record number ) for every record of the input
# (record numbers start at 1) and write the records for which the result is
# true, or false when -not is given. Ends the program.
if( defined $opt_perlfile )
{
	# Load the code at run time.
	# A plain relative name such as "filter.pm" is searched by require in
	# @INC only, and the current directory is no longer part of @INC since
	# perl 5.26. Try the name as given first, so that the previous search
	# order is kept, then fall back to the file in the current directory.
	if( ! eval { require $opt_perlfile ; 1 } )
	{
		my $load_error = $@ ;
		my $has_explicit_directory = ( $opt_perlfile =~ m!^(\.\.?)?/! ) ;
		
		if( $load_error =~ /^Can't locate /
			&& ! $has_explicit_directory
			&& -f $opt_perlfile )
		{
			require "./$opt_perlfile" ;
		}
		else
		{
			die $load_error ;	# any other failure: report it unchanged
		}
	}
	
	use strict ;
	my $current_record = 0 ;
	
	FOR_EACH_SDF_ENTRY: while( 1 )
	{ 
		
		my $sdf_entry = MDL_sdf_non_parsed_molecule->readFromInput() ; 
		defined $sdf_entry || last FOR_EACH_SDF_ENTRY ;  #end of the input
		
		$current_record ++ ;
		
		# Scalar assignment: the user function is called in scalar context.
		my $bool = is_sdf_record_kept( $sdf_entry, $current_record) ;
		
		# -not is applied by bool_not_test(), as in the other modes.
		if( bool_not_test($bool) )
		{
			$sdf_entry->write() ;
		}
		
	}

	exit() ;
}
#############################################################################

# Reached only when none of the selection blocks above ran (each of them
# ends with exit): no usable selection option was given.
my $no_selection_message = "No options selected or missing options.\nTry   $me -h\n" ;
die $no_selection_message ;

#############################################################################




