#
#  Integrative_New_DDRs.pl
#
#  Created by Eduardo Andrés León on 2016-07-05.
#  Copyright (c) 2016 IPBLN. All rights reserved.
#
#!/usr/bin/perl
$|=1;
use strict;
use Getopt::Long;
use DBI;            
require LWP::UserAgent;
my $ua = LWP::UserAgent->new;

my $help;
my $dbh;

my $ddr_file;
my $emergence_file;
GetOptions(
	"help" => \$help,
	"DDR|d=s" => \$ddr_file,
	"Emergence|e=s" => \$emergence_file
);
if($help){
	help();
}
elsif($ddr_file and $emergence_file){
	print STDERR date() . " Checking genes in database\n";
	
	my $genes;
	my $uniProt_id;
	my $uniProt_acc;
	my $data;
	
	my $lost;
	my $saved;
	my $tier;
	
	# =======================
	# = DATABASE CONNECTION =
	# =======================


	my $database='DDR_120115';
	my $hostname='demetriusIB';
	my $dsn = "DBI:mysql:database=$database;host=$hostname";
	my $user='ddr_admin';
	my $password='ddr_2014';
	$dbh = DBI->connect($dsn, $user,$password);

	my $ages = $dbh->selectall_hashref("SELECT id_age,gene_age FROM age",'gene_age') || die $dbh->errstr;
	my $network = $dbh->selectall_hashref("SELECT id_network,network FROM network",'network') || die $dbh->errstr;
	my $pathway = $dbh->selectall_hashref("SELECT id_pathway,pathway FROM pathway",'pathway') || die $dbh->errstr;

	#Family
	my $family = $dbh->selectrow_array("select max(id_family) from family") || die $dbh->errstr;
	$family++;
	
	#PTMs
	my $maxPTMs = $dbh->selectrow_array("select max(id_ptm) from PTMs") || die $dbh->errstr;
	$maxPTMs++;
	
	#Tier
	my $sth = $dbh->prepare("SELECT * FROM Tier") || die $dbh->errstr;
	$sth->execute or die "can't execute the query: $sth->errstr\n";
	while(my @row = $sth->fetchrow_array) {
		$tier->{$row[1]}->{$row[2]}=$row[0];
	}
	my $MaxTier = $dbh->selectrow_array("select max(id_tier) from Tier") || die $dbh->errstr;
	
	#Entity
	my $maxEntity = $dbh->selectrow_array("select max(id_entity) from entity") || die $dbh->errstr;
	my $sth = $dbh->prepare("SELECT * FROM entity") || die $dbh->errstr;
	$sth->execute or die "can't execute the query: $sth->errstr\n";
	while(my @row = $sth->fetchrow_array) {
		$genes->{$row[1]}=$row[0];
		$genes->{$row[2]}=$row[0];
		$uniProt_id->{$row[4]}=$row[0];
		$uniProt_acc->{$row[3]}=$row[0];
		
		$data->{$row[0]}->{accesion}=$row[3];
		$data->{$row[0]}->{name}=$row[2];
		$data->{$row[0]}->{gene}=$row[1];
		
	}
	
	#Emergence	
	my $sth = $dbh->prepare("select distinct o.*,a.gene_age,e.link from orthologs o, emergence e, age a where e.id_org=o.id_org and a.id_age=o.id_age") || die $dbh->errstr;
	$sth->execute or die "can't execute the query: $sth->errstr\n";
	
	my $emergence;
	while(my @row = $sth->fetchrow_array) {
		$emergence->{$row[2]}->{id}=$row[0];
		$emergence->{$row[0]}->{code}=$row[1];
		$emergence->{$row[0]}->{organism}=$row[2];
		$emergence->{$row[0]}->{age}=$row[3];
		$emergence->{$row[0]}->{gene_age}=$row[4];
		$emergence->{$row[0]}->{link}=$row[5];
	}
	#Emergence insertados
	
	my $sth = $dbh->prepare("select * from emergence") || die $dbh->errstr;
	$sth->execute or die "can't execute the query: $sth->errstr\n";
	
	my $emergence_inserted;
	while(my @row = $sth->fetchrow_array) {
		$emergence_inserted->{$row[0]}->{$row[1]}->{$row[2]}=$row[3];
	}
	
	my $pre_family;
	my $id_family;
	my $saved_family;
	open(DDR,$ddr_file) || die "$!: Can't read $ddr_file\n";
	while(<DDR>){
		chomp;
		if($_ !~ /^Uniprot/){
			my ($Uniprot,$Displacement,$Class_tree,$Agreement,$single_gene_tree,$multi_gene_tree,$Age,$Family,$Effector,$Mediator,$Sensor,$Traducer,$Network,$Pathway)=split(/\t/);
			$Uniprot=~s/\s+//g;
			
			if(!exists $pre_family->{$Family}){
				$id_family=$family++;
				$pre_family->{$Family}=$id_family;
			}
			if($Pathway =~/;/){
				my @pathways=split(/;/,$Pathway);
				foreach my $path(@pathways){
					if(!$pathway->{$path}->{id_pathway}){
						print STDERR "WARN:: Pathway not inserted for $Uniprot cause $path is not found\n";
						next;
					}
				}
			}
			else{
				if(!$pathway->{$Pathway}->{id_pathway}){
					print STDERR "WARN:: Pathway not inserted for $Uniprot cause $Pathway is not found\n";
					next;
				}
			}
			if(exists $uniProt_acc->{$Uniprot}){
				print STDERR "Ummm $Uniprot has been already inserted with id=". $uniProt_acc->{$Uniprot} ."\n";
			}
			else{
				print "Searching/Inserting for $Uniprot\n";
				$maxEntity++;
				if(!exists $saved->{$Uniprot}){
				
					my $info=get_info($Uniprot);
				
					#print $Uniprot ."\t-". $info->{$Uniprot}->{GN} ."-\t". $info->{$Uniprot}->{ID} ."\t" . $info->{$Uniprot}->{ENS} ."\t" .$info->{$Uniprot}->{DE} ."\t[" . $ages->{$Age}->{id_age} .":$Age]\t$Family\[".$pre_family->{$Family}."\]\n";# . $info->{$Uniprot}->{SEQ} ."\n";
				
					#print "$Uniprot\t$Pathway\[".$pathway->{$Pathway}->{id_pathway}."\]\t$Network\[" .$network->{$Network}->{id_network}."\]\n";
					#check all NOT NULL fields before inserting
					
					if($maxEntity and $info->{$Uniprot}->{GN} and $info->{$Uniprot}->{ID} and $info->{$Uniprot}->{SEQ} and $ages->{$Age}->{id_age} and $pre_family->{$Family} and $network->{$Network}->{id_network}){
						
						#Insert family
						if(!exists $saved_family->{$pre_family->{$Family}}){
							$dbh->do('INSERT INTO family VALUES (?,?)',undef,$pre_family->{$Family},$pre_family->{$Family});
							$saved_family->{$pre_family->{$Family}}++;
						}
						
						#Insert entity;
						$dbh->do('INSERT INTO entity (id_entity,`Gene Name`,`Uniprot Id`,`Uniprot Name`,Description,`Ensembl Id`,Organism,Displacement,class_tree,agreement,single_gene_tree,multi_gene_tree,sequence,id_age,id_family) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)',
						 	undef,
						 	$maxEntity,
						 	$info->{$Uniprot}->{GN},
						 	$Uniprot,
							$info->{$Uniprot}->{ID},
							$info->{$Uniprot}->{DE},
							$info->{$Uniprot}->{ENS},
							"Homo_sapiens",
							$Displacement,
							$Class_tree,
							$Agreement,
							$single_gene_tree,
							$multi_gene_tree,
							$info->{$Uniprot}->{SEQ},
							$ages->{$Age}->{id_age},
							$pre_family->{$Family},
	 					);
						#effector
						
						if($Effector){
							if($Effector =~/;/){
								my @pmids=split(/;/,$Effector);
								foreach my $pmid(@pmids){
									# if(exists $tier->{Effector}->{$pmid}){
									# 	#Tier already saved
									# 	#Tier rel
									# 	$dbh->do('INSERT INTO Tier_rel VALUES (?,?)',undef,$maxEntity,$tier->{Effector}->{$pmid});
									# }
									# else{
										#Tier 
										$MaxTier++;
										$dbh->do('INSERT INTO Tier VALUES (?,?,?)',undef,$MaxTier,'Effector',$pmid);
										#Tier rel
										$dbh->do('INSERT INTO Tier_rel VALUES (?,?)',undef,$maxEntity,$MaxTier);
										
									# }
								}
							}
							else{
								# if(exists $tier->{Effector}->{$Effector}){
								# 	#Tier already saved
								# 	#Tier rel
								# 	$dbh->do('INSERT INTO Tier_rel VALUES (?,?)',undef,$maxEntity,$tier->{Effector}->{$Effector});
								# }
								# else{
									#Tier 
									$MaxTier++;
									$dbh->do('INSERT INTO Tier VALUES (?,?,?)',undef,$MaxTier,'Effector',$Effector);
									#Tier rel
									$dbh->do('INSERT INTO Tier_rel VALUES (?,?)',undef,$maxEntity,$MaxTier);
									
								#}
							}
						}
						
						#mediator
						if($Mediator){
							if($Mediator =~/;/){
								my @pmids=split(/;/,$Mediator);
								foreach my $pmid(@pmids){
									# if(exists $tier->{Mediator}->{$pmid}){
									# 	#Tier already saved
									# 	#Tier rel
									# 	$dbh->do('INSERT INTO Tier_rel VALUES (?,?)',undef,$maxEntity,$tier->{Mediator}->{$pmid});
									# }
									# else{
										#Tier 
										$MaxTier++;
										$dbh->do('INSERT INTO Tier VALUES (?,?,?)',undef,$MaxTier,'Mediator',$pmid);
										#Tier rel
										$dbh->do('INSERT INTO Tier_rel VALUES (?,?)',undef,$maxEntity,$MaxTier);
										
									#}
								}
							}
							else{
								# if(exists $tier->{Mediator}->{$Mediator}){
								# 	#Tier already saved
								# 	#Tier rel
								# 	$dbh->do('INSERT INTO Tier_rel VALUES (?,?)',undef,$maxEntity,$tier->{Mediator}->{$Mediator});
								# }
								# else{
									#Tier 
									$MaxTier++;
									$dbh->do('INSERT INTO Tier VALUES (?,?,?)',undef,$MaxTier,'Mediator',$Mediator);
									#Tier rel
									$dbh->do('INSERT INTO Tier_rel VALUES (?,?)',undef,$maxEntity,$MaxTier);
									
								#}
							}
						}
						#sensor
						if($Sensor){
							if($Sensor =~/;/){
								my @pmids=split(/;/,$Sensor);
								foreach my $pmid(@pmids){
									# if(exists $tier->{Sensor}->{$pmid}){
									# 	#Tier already saved
									# 	#Tier rel
									# 	$dbh->do('INSERT INTO Tier_rel VALUES (?,?)',undef,$maxEntity,$tier->{Sensor}->{$pmid});
									# }
									# else{
									# 	#Tier
										$MaxTier++;
										$dbh->do('INSERT INTO Tier VALUES (?,?,?)',undef,$MaxTier,'Sensor',$pmid);
										#Tier rel
										$dbh->do('INSERT INTO Tier_rel VALUES (?,?)',undef,$maxEntity,$MaxTier);
										
									#}
								}
							}
							else{
								# if(exists $tier->{Sensor}->{$Sensor}){
								# 	#Tier already saved
								# 	#Tier rel
								# 	$dbh->do('INSERT INTO Tier_rel VALUES (?,?)',undef,$maxEntity,$tier->{Sensor}->{$Sensor});
								# }
								# else{
									#Tier 
									$MaxTier++;
									$dbh->do('INSERT INTO Tier VALUES (?,?,?)',undef,$MaxTier,'Sensor',$Sensor);
									#Tier rel
									$dbh->do('INSERT INTO Tier_rel VALUES (?,?)',undef,$maxEntity,$MaxTier);
									
								#}
							}
						}
						#Traducer
						if($Traducer){
							if($Traducer =~/;/){
								my @pmids=split(/;/,$Traducer);
								foreach my $pmid(@pmids){
									# if(exists $tier->{Traducer}->{$pmid}){
									# 	#Tier already saved
									# 	#Tier rel
									# 	$dbh->do('INSERT INTO Tier_rel VALUES (?,?)',undef,$maxEntity,$tier->{Traducer}->{$pmid});
									# }
									# else{
										#Tier 
										$MaxTier++;
										$dbh->do('INSERT INTO Tier VALUES (?,?,?)',undef,$MaxTier,'Traducer',$pmid);
										#Tier rel
										$dbh->do('INSERT INTO Tier_rel VALUES (?,?)',undef,$maxEntity,$MaxTier);
										
									#}
								}
							}
							else{
								# if(exists $tier->{Traducer}->{$Traducer}){
								# 	#Tier already saved
								# 	#Tier rel
								# 	$dbh->do('INSERT INTO Tier_rel VALUES (?,?)',undef,$maxEntity,$tier->{Traducer}->{$Traducer});
								# }
								# else{
									#Tier 
									$MaxTier++;
									$dbh->do('INSERT INTO Tier VALUES (?,?,?)',undef,$MaxTier,'Traducer',$Traducer);
									#Tier rel
									$dbh->do('INSERT INTO Tier_rel VALUES (?,?)',undef,$maxEntity,$MaxTier);
									
								#}
							}
						}
						#insert Pathways
						if($Pathway =~/;/){
							my @pathways=split(/;/,$Pathway);
							$dbh->do('INSERT INTO network_rel VALUES (?,?)',undef,$maxEntity,$network->{$Network}->{id_network});
							foreach my $path(@pathways){
								if($pathway->{$path}->{id_pathway}){
									$dbh->do('INSERT INTO pathway_rel VALUES (?,?)',undef,$maxEntity,$pathway->{$path}->{id_pathway});
								}
								else{
									print STDERR "WARN:: Patway not inserted for $Uniprot cause $path is not found\n";
								}
							}
						}
						else{
							if($pathway->{$Pathway}->{id_pathway}){
								$dbh->do('INSERT INTO network_rel VALUES (?,?)',undef,$maxEntity,$network->{$Network}->{id_network});
								$dbh->do('INSERT INTO pathway_rel VALUES (?,?)',undef,$maxEntity,$pathway->{$Pathway}->{id_pathway});
							}
							else{
								print STDERR "WARN:: Patway not inserted for $Uniprot cause $Pathway is not found\n";
							}

						}
						#min PTMs
						$dbh->do('INSERT INTO PTMs VALUES (?,?,?,?)',undef,$maxPTMs,$maxEntity,$maxEntity,5);
						$maxPTMs++;
						$saved->{$Uniprot}++;
					 }
					 else{
						 print STDERR "\nWARN :: $Uniprot has not been stored cause we found missing data: id_entity ($maxEntity); GeneName (".$info->{$Uniprot}->{GN}."); UniprotID (".$info->{$Uniprot}->{ID}."); Age (".$ages->{$Age}->{id_age}."); Family (".$pre_family->{$Family}."); Seq [length shown] (".length($info->{$Uniprot}->{SEQ})."); Network (".$network->{$Network}->{id_network}."); Pathway (".$pathway->{$Pathway}->{id_pathway}.")\n\n";
						 
					 }
				}
			}
		}
	}
	close DDR;
	
	open(EMERGENCE,$emergence_file) || die $!;
	my $saved_emergence;
	while(<EMERGENCE>){
		chomp;
		if($_ !~ /^Uniprot/){
			my($uniprot,$organism,$id_orth)=split(/\t/);
			if($uniProt_acc->{$uniprot} and $emergence->{$organism}->{id}){
				#print "$uniprot (".$uniProt_acc->{$uniprot}.")\t-$organism- (".$emergence->{$organism}->{id}.")\t".$id_orth ."\n" if(!$emergence->{$organism}->{id});
				if(!exists $saved_emergence->{$uniProt_acc->{$uniprot}}->{$emergence->{$organism}->{id}}){
					if(!exists $emergence_inserted->{$uniProt_acc->{$uniprot}}->{$emergence->{$organism}->{id}}->{$id_orth}){
						$dbh->do('INSERT INTO emergence VALUES (?,?,?,?)',undef,$uniProt_acc->{$uniprot},$emergence->{$organism}->{id},$id_orth,$emergence->{$emergence->{$organism}->{id}}->{link});
						$saved_emergence->{$uniProt_acc->{$uniprot}}->{$emergence->{$organism}->{id}}++;
					}
					else{
						print "Uniprot: $uniprot and $id_orth has been already inserted\n";
					}
				}
			}
			else{
				print "Missing data: Uniprot $uniprot (".$uniProt_acc->{$uniprot}."), organism: $organism (".$emergence->{$organism}->{id}.")\n";
			}
			
		}
	}
	close EMERGENCE;
}
else{
	help();
}	
sub help{
        my $usage = qq{
          $0 

            Getting help:
                [--help]

            Needed parameters:
              [DDR|d] : template with new DDR
			  [Emergence|e] : template with emergence data             
			   
            Examples:
              perl $0 -d template_new_DDRs_AR1.txt -e template_new_DDRs_AR1_emergence.txt
                       
 };

print STDERR $usage;
exit();
        
}

sub date{
        my $date=("date \"+%D %H:%M:%S\"");
        $date=`$date`;
        chomp $date;
        return("[".$date."]");
}		

sub get_info{
	my $uniprot=shift;
	my $file=get_uniprot($uniprot);
	
	my $result;
	
	open(SWS,$file) || die "$!";
	# Por seguridad, limpio las variables
	my @acc=();
	my $id='';
	my $lastdate='';
	my $description='';
	my $sequence='';
	my $molw='';
	my $readingseq=undef;
	my $ensembl;
	my $genename;
	# Procesamiento del fichero, leyendo línea a línea
	while(my $line=<SWS>) {
		# Lo primero, quitar el salto de línea
		chomp($line);
	
		# Detección del final del registro
		if($line =~ /^\/\//) {
		
			$description=undef  if($description eq '');
		
			# Impresión de comprobación
			#print "ACC: $acc[0] ; ID: $id ; ENS: -$ensembl- : DE: $description ; SEQ $sequence\n";
			
			$result->{$uniprot}->{ID}=$id;
			$result->{$uniprot}->{ENS}=$ensembl;
			$result->{$uniprot}->{DE}=$description;
			$result->{$uniprot}->{SEQ}=$sequence;
			$result->{$uniprot}->{GN}=$genename;
			
			# Una vez guardados los datos, tenemos
			# que borrar el contenido de las variables
			# para evitar fallos en las iteraciones
			@acc=();
			$id='';
			$lastdate='';
			$description='';
			$sequence='';
			$molw='';
			$readingseq=undef;
			$ensembl='';
			$genename='';
		}
	
		# ¿Estoy leyendo una secuencia?
		if(defined($readingseq)) {
			# Quito todos los espacios intermedios 
			$line =~ s/\s+//g;
		
			# Y concateno
			$sequence .= $line;
		
		# Como no la estoy leyendo, busco los patrones apropiados
		} elsif($line =~ /^SQ.+[^0-9](\d+) MW/) {
			# Extracción del peso molecular
			# y comienzo de secuencia
			$molw=$1;
		
			$readingseq=1;
	
		} elsif($line =~ /^ID   ([a-zA-Z0-9_]+)/) {
			# Identificador
			$id = $1;
		}elsif($line =~ /^AC   (.+)/) {
			# Los accnumber, que pueden estar en varias líneas
			my $ac=$1;	
			# Elimino los espacios
			$ac =~ s/\s+//g;
		
			# Rompo por los puntos y coma, y
			# añado a la lista de accnumber
			push(@acc,split(/;/,$ac));
		} elsif($line =~ /^DE            EC=.+;/ or $line =~ /^DE   RecName: Full=(.+);/) {
			# La descripción, que puede estar en varias líneas
			#$description .= ', EC '  if($description ne '');
			$description .= $1;
		}elsif($line =~ /^DR\s+Ensembl;\s+(.+)/){
			my $ensembls=$1;
			$ensembls=~s/\.\s+\[.+\]//g;
			$ensembls=~s/\s+//g;
			my @ens=split(/;/,$ensembls);
			$ensembl=$ens[2];
			$ensembl =~s/\.//g;
		}elsif($line =~ /^GN   Name=(.+)[;\s]{0,1}/){
			$genename=$1;
			$genename=~s/;.*//g;
			$genename=~s/^(\w+)\s+.*/$1/g;
		}
	}

	# Se cierra el fichero procesado
	close(SWS);
	return($result);
}
	
sub get_uniprot{
	my $id_name=shift;
	#print STDERR "Downloading http://www.uniprot.org/uniprot/$id_name.txt\n";
	my $response = $ua->get("http://www.uniprot.org/uniprot/$id_name.txt");
	my $tmp_file="/tmp/.$id_name.txt";
	open(TMP,">$tmp_file") || die $!;
	
	if ($response->is_success) {
	    print TMP $response->decoded_content;  # or whatever
		
	}
	else {
	    die $response->status_line;
	}
	close TMP;
	return($tmp_file) if ($response->is_success);
}
	