#!/usr/local/bin/perl


use Getopt::Long ;
use File::Basename ;
use strict ;
use warnings ;

# Program name used in diagnostics and in the usage text. It is declared with
# 'our' (a package variable) because usage() reads $me as a global.
our $me = basename( $0, ".perl") ;

# GetOptions() is called without explicit destinations, so Getopt::Long stores
# each option in the package variable $opt_<name>. They are declared here so
# the script compiles under 'use strict'.
our ( $opt_help, $opt_match_key, $opt_noskip, $opt_perlclass,
      $opt_table_file, $opt_multi, $opt_silent ) ;

my $ret = GetOptions('help', 'match_key=s',  'noskip', 'perlclass:s', 'table_file=s', 'multi', 'silent');
$ret || die "Options are not correct. Try $me -help\n" ;

if( defined($opt_help)  ) {
	usage() ;	# usage() dies with the help text, so nothing below runs
}


################################################################################
#Options:
#	$match_key       : the label used to match the entries of the SDF files and the properties table
#	$table_file_name : the file name of the properties table

my $match_key = $opt_match_key ;
defined $match_key || die "Undefined option. Try $me -help\n";

my $table_file_name = $opt_table_file ;
defined $table_file_name || die "Undefined option. Try $me -help\n";

################################################################################
#Libraries:

require 5.006 ;	# 'our' and 'use warnings' need perl 5.6 or later

require MDL_sdf ;


# 'perlclass:s' makes the value optional: a bare -perlclass yields an empty
# string, which must fall back to the default just like an absent option
# (otherwise the script would try to load ".pm").
if( ! defined $opt_perlclass || $opt_perlclass eq '' )
{
	$opt_perlclass = 'CSV' ;
}

#remove the .pm in case there would be one
$opt_perlclass =~ s/(.+)\.pm$/$1/ ;
require "${opt_perlclass}.pm";

# The table class name is derived from the package name, e.g. CSV => CSV_table.
my $TableClass = sprintf "%s_table", $opt_perlclass ;


################################################################################


#read the whole table
#$table = CSV_table->readFromInputFile($table_file_name) ;
#$table = CSV_table->readFromInputFileUsingQuickKey($table_file_name, $match_key) ;
my $table = $TableClass->readFromInputFileUsingQuickKey($table_file_name, $match_key) ;

################################################################################


my $record = 0 ;	# running entry number, only used in the messages

#for each $sdf_entry read from the input
FOR_EACH_SDF_ENTRY: while( 1 )
{

	#my $sdf_entry = MDL_sdf->readFromInput() ;
	my $sdf_entry = MDL_sdf_non_parsed_molecule->readFromInput() ; #faster

	defined $sdf_entry || last FOR_EACH_SDF_ENTRY ;  #end of the loop

	$record ++ ;


	#get the value of its key ($match_key)
	my $sdf_id = $sdf_entry->data_for_field_name($match_key) ;
	if( ! defined $sdf_id )
	{
		defined $opt_silent || warn "The key $match_key is not defined for entry $record.\n";
		if( ! defined $opt_noskip )
		{
			defined $opt_silent || warn " Skipping\n";
			next FOR_EACH_SDF_ENTRY ;
		}

		# -noskip: the entry cannot be matched against the table, but it must
		# still reach the output, so it is written unchanged.
		$sdf_entry->write() ;
		next FOR_EACH_SDF_ENTRY ;
	}

	defined $opt_silent || printf STDERR "%4d %s => %s\n", $record, $match_key, $sdf_id ;

	#get the new properties from the table using the SDF entry's key
	#$propertiesDict is expected to be a ref to a hash table.
	my $propertiesDict = $table->getProperties( $match_key , $sdf_id) ;

	# 'defined %hash' is a fatal error on perl >= 5.22; test for a reference
	# to a non-empty hash instead.
	my $has_properties = ( ref($propertiesDict) && %$propertiesDict ) ? 1 : 0 ;

	if( ! $has_properties )
	{
		defined $opt_silent || warn "$match_key => $sdf_id: data not available in the table.\n";
		if( ! defined $opt_noskip )
		{
			defined $opt_silent || warn " Skipping\n";
			next FOR_EACH_SDF_ENTRY ;
		}
	}

	if( defined $opt_multi )
	{
		# One output entry per matching table row.
		my $props = $table->get_multi_Properties( $match_key , $sdf_id) ;
		my @rows  = ref($props) ? @$props : () ;

		my $i = 0 ;
		foreach my $row (@rows)
		{
			$i++ ;
			#add all the new properties
			$sdf_entry->addReplaceProperties( $row );
			#write the sdf entry
			$sdf_entry->write();
			defined $opt_silent || printf STDERR "%d ", $i ;
		}
		defined $opt_silent || print STDERR "\n";

		# -noskip: keep the entry even when the table has no row for it.
		if( ! @rows && defined $opt_noskip )
		{
			$sdf_entry->write() ;
		}
	}
	else #No option #single property row
	{
		#add all the new properties (nothing to add when the table had no data)
		$has_properties && $sdf_entry->addReplaceProperties( $propertiesDict );
		#write the sdf entry
		$sdf_entry->write();
	}
}

################################################################################

# usage()
#
# Aborts the program with the complete help text: the command-line synopsis,
# the description of every option, and a worked example (input SDF file,
# CSV table and resulting SDF file).
#
# Takes no arguments and never returns: the text is passed to die(), so it is
# sent to STDERR and the process ends with a non-zero status.
# Reads the global $me (program name) for the synopsis and the example.
sub usage
{
	die( 
	"Example usage : $me [-help] -match_key NSC -table_file logP.csv [-perlclass CSV] [-multi] [-noskip] [-silent] < input.sdf > output.sdf
$me reads a SDF file (MDL format), adds properties from a comma separated values (CSV) file and prints out the new SDF file.

Options:
	-match_key: the label used to match the entries of the SDF files and the properties table

	-table_file: the file name of the properties table

	-multi: if the table contains more than one entry for the match key, 
	the output will contain several entries (with the same chemical structure)

	-noskip: keep all entries from the inputfile, even those without a property defined in the table

	-perlclass: specify the type of table used. The argument is a perl package name. For instance, CSV (comma separated values), stands for the perl package CSV.pm. Other possibilities are : SSV (space separated values), NCI_screen (NCI screen data), ... or any new not yet available packages. The default is CSV.
	
	-silent : no verbose messages
	 
Notes : 
	- Options can be abbreviated as long as they are unambiguous.
	- The output file is compatible with ChemFinder, Cactus's csbr, ...
	- Large tables can require a lot of memory.
	- For NCI_screen data, look at the program nciscreen2csv to preprocess the data
	
	
##################################################################	
Complete example:
	$me -match_key NSC -table_file sample.csv < sample.sdf > new.sdf


	A SDF file (sample.sdf):
-----------------------------------------------------------------
MDL Molfile
csChmFindW1127982019
[comments]
 12 11  0     0  0  0  0  0  0  1 V2000
    5.5716    4.2788    0.0162 C   0  0  0  0  0  0  0  0  0  0
    5.6039    1.8954    0.0031 C   0  0  0  0  0  0  0  0  0  0
    7.7741    5.0914   -0.0005 C   0  0  0  0  0  0  0  0  0  0
    4.4950    5.0162   -1.7874 O   0  0  0  0  0  0  0  0  0  0
    3.4015    1.0829    0.0198 C   0  0  0  0  0  0  0  0  0  0
    6.6458    1.1777   -1.8285 O   0  0  0  0  0  0  0  0  0  0
    9.0833    4.5955    1.5528 O   0  0  0  0  0  0  0  0  0  0
    8.3615    6.1895   -1.4119 O   0  0  0  0  0  0  0  0  0  0
    2.1222    1.5622    1.6030 O   0  0  0  0  0  0  0  0  0  0
    2.7871    0.0000   -1.3920 O   0  0  0  0  0  0  0  0  0  0
   11.2055    5.3784    1.5369 C   0  0  0  0  0  0  0  0  0  0
    0.0000    0.7791    1.6191 C   0  0  0  0  0  0  0  0  0  0
  2  1  1  0  0  0  0
  3  1  1  0  0  0  0
  4  1  1  0  0  0  0
  5  2  1  0  0  0  0
  6  2  1  0  0  0  0
  7  3  1  0  0  0  0
  8  3  2  0  0  0  0
  9  5  1  0  0  0  0
 10  5  2  0  0  0  0
 11  7  1  0  0  0  0
 12  9  1  0  0  0  0
M  END
>  <MOL_ID> (500)
500

>  <Formula> (500)
C6H10O6

>  <MolWeight> (500)
178.14

>  <NSC> (500)
517

>  <CAS_RN> (500)
89599-43-9

\$\$\$\$
MDL Molfile
csChmFindW1127982019
[comments]
  8  7  0     0  0  0  0  0  0  1 V2000
    1.2944    4.2707    0.0095 C   0  0  0  0  0  0  0  0  0  0
    1.3135    2.8708    0.0018 C   0  0  0  0  0  0  0  0  0  0
    2.6079    4.7553   -0.0004 C   0  0  0  0  0  0  0  0  0  0
    0.0000    2.3863    0.0118 C   0  0  0  0  0  0  0  0  0  0
    2.5890    6.1552    0.0073 C   0  0  0  0  0  0  0  0  0  0
    0.0183    1.0394    0.0044 C   0  0  0  0  0  0  0  0  0  0
    3.8500    6.6205   -0.0022 N   0  0  0  0  0  0  0  0  0  0
    0.0324    0.0000   -0.0013 N   0  0  0  0  0  0  0  0  0  0
  2  1  1  0  0  0  0
  3  1  1  0  0  0  0
  4  2  1  0  0  0  0
  5  3  1  0  0  0  0
  6  4  1  0  0  0  0
  7  5  1  0  0  0  0
  8  6  3  0  0  0  0
M  END
>  <MOL_ID> (493)
493

>  <Formula> (493)
C6H12N2

>  <MolWeight> (493)
112.17

>  <NSC> (493)
510

>  <CAS_RN> (493)
2432-74-8

\$\$\$\$
-----------------------------------------------------------------




A table file (CSV format) :


-----------------------------------------------------------------

NSC,LogP,Ki,comment
510,2.90,59,\"Quite good, but toxic\"
517,5.2,120,\"Inactive, forget it\"

-----------------------------------------------------------------






In this example, the NSC is the match key, i.e. it must be present in both input files.
An exact string match is used to link the entries of the SDF with the rows of the table.

The properties ordering of the table and SDF file is not important.
The match_key column of the table does not need to be the first column of the file.


The output file new.sdf (pay attention to the new '> <XXXX>' data field entries):


-----------------------------------------------------------------
MDL Molfile
csChmFindW1127982019
[comments]
 12 11  0     0  0  0  0  0  0  1 V2000
    5.5716    4.2788    0.0162 C   0  0  0  0  0  0  0  0  0  0  0  0
    5.6039    1.8954    0.0031 C   0  0  0  0  0  0  0  0  0  0  0  0
    7.7741    5.0914   -0.0005 C   0  0  0  0  0  0  0  0  0  0  0  0
    4.4950    5.0162   -1.7874 O   0  0  0  0  0  0  0  0  0  0  0  0
    3.4015    1.0829    0.0198 C   0  0  0  0  0  0  0  0  0  0  0  0
    6.6458    1.1777   -1.8285 O   0  0  0  0  0  0  0  0  0  0  0  0
    9.0833    4.5955    1.5528 O   0  0  0  0  0  0  0  0  0  0  0  0
    8.3615    6.1895   -1.4119 O   0  0  0  0  0  0  0  0  0  0  0  0
    2.1222    1.5622    1.6030 O   0  0  0  0  0  0  0  0  0  0  0  0
    2.7871    0.0000   -1.3920 O   0  0  0  0  0  0  0  0  0  0  0  0
   11.2055    5.3784    1.5369 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    0.7791    1.6191 C   0  0  0  0  0  0  0  0  0  0  0  0
  2  1  1  0  0  0  0
  3  1  1  0  0  0  0
  4  1  1  0  0  0  0
  5  2  1  0  0  0  0
  6  2  1  0  0  0  0
  7  3  1  0  0  0  0
  8  3  2  0  0  0  0
  9  5  1  0  0  0  0
 10  5  2  0  0  0  0
 11  7  1  0  0  0  0
 12  9  1  0  0  0  0
M  END
>  <MOL_ID> (500)
500

>  <Formula> (500)
C6H10O6

>  <MolWeight> (500)
178.14

>  <NSC> (500)
517

>  <CAS_RN> (500)
89599-43-9

>  <LogP>
5.2

>  <comment>
Inactive, forget it

>  <Ki>
120

\$\$\$\$
MDL Molfile
csChmFindW1127982019
[comments]
  8  7  0     0  0  0  0  0  0  1 V2000
    1.2944    4.2707    0.0095 C   0  0  0  0  0  0  0  0  0  0  0  0
    1.3135    2.8708    0.0018 C   0  0  0  0  0  0  0  0  0  0  0  0
    2.6079    4.7553   -0.0004 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.0000    2.3863    0.0118 C   0  0  0  0  0  0  0  0  0  0  0  0
    2.5890    6.1552    0.0073 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.0183    1.0394    0.0044 C   0  0  0  0  0  0  0  0  0  0  0  0
    3.8500    6.6205   -0.0022 N   0  0  0  0  0  0  0  0  0  0  0  0
    0.0324    0.0000   -0.0013 N   0  0  0  0  0  0  0  0  0  0  0  0
  2  1  1  0  0  0  0
  3  1  1  0  0  0  0
  4  2  1  0  0  0  0
  5  3  1  0  0  0  0
  6  4  1  0  0  0  0
  7  5  1  0  0  0  0
  8  6  3  0  0  0  0
M  END
>  <MOL_ID> (493)
493

>  <Formula> (493)
C6H12N2

>  <MolWeight> (493)
112.17

>  <NSC> (493)
510

>  <CAS_RN> (493)
2432-74-8

>  <LogP>
2.90

>  <comment>
Quite good, but toxic

>  <Ki>
59

\$\$\$\$
---------------------------------------------------------------------------
\n") ;
}
