#
#  Integrative_pairs_all_tumors.pl
#
#  Created by Eduardo Andrés León on 2016-06-27.
#  Copyright (c) 2016 IPBLN. All rights reserved.
#
#!/usr/bin/perl
$|=1;
use strict;
use Getopt::Long;
use DBI;            

# Command line state.
my $help;   # true when --help was given
my $dbh;    # DBI handle; file-scoped because the get_*_from_db subs below read it
my $ana;    # true when --ana was given (write the missing-PTM report file)

# GetOptions returns false when it meets an unknown or malformed option.
# Stop here in that case instead of going on to modify the database.
GetOptions(
	"help" => \$help,
	"ana" => \$ana
) or die "Invalid command line options; run with --help for usage\n";
# Main program.
#
# With --help the usage text is printed and the script exits. Otherwise it:
#   1. connects to the database and loads the entity look-up tables,
#   2. inserts the PTMs listed in Manually_curated_PTMs.txt,
#   3. inserts the human PTMs with a known modifier found in ptm.txt (PIR),
#   4. inserts the PTMs with a known modifier found in dbPTM3.txt (dbPTM),
#   5. collects PTMs that have no modifier and are not yet in the database
#      in $lost; with --ana the ones coming from ptm.txt are also written to
#      missing_PTMS_<pid>.xls.
if($help){
	help();
}
else{
	print STDERR date() . " Checking genes in database\n";

	# =======================
	# = DATABASE CONNECTION =
	# =======================

	# NOTE(review): the credentials are hard-coded in the source; consider
	# moving them to a configuration file or to environment variables.
	my $database='DDR_120115';
	my $hostname='demetriusIB';
	my $dsn = "DBI:mysql:database=$database;host=$hostname";
	my $user='ddr_admin';
	my $password='ddr_2014';
	$dbh = DBI->connect($dsn, $user,$password)
		or die "Can't connect to $database on $hostname: $DBI::errstr\n";

	# Method calls are not interpolated inside strings, so the error text is
	# concatenated explicitly.
	my $sth = $dbh->prepare("SELECT * FROM entity") or die "Can't prepare query: " . $dbh->errstr . "\n";
	$sth->execute or die "can't execute the query: " . $sth->errstr . "\n";

	my $genes={};       # entity column 1 (gene) and column 2 (name) -> entity id (column 0)
	my $uniProt_id={};  # entity column 4 -> entity id
	my $gene_of={};     # entity id -> gene (column 1)
	my $lost={};        # "gene\tmodification\tresidue\tpmids" of PTMs without modifier

	while(my @row = $sth->fetchrow_array) {
		$genes->{$row[1]}=$row[0];
		$genes->{$row[2]}=$row[0];
		$uniProt_id->{$row[4]}=$row[0];
		$gene_of->{$row[0]}=$row[1];
	}

	# max(id_ptm) is NULL on an empty table; that is not an error, numbering
	# simply starts from 0 in that case.
	my($maxPTMs) = $dbh->selectrow_array("select max(id_ptm) from PTMs");
	die $dbh->errstr if $dbh->err;
	$maxPTMs = 0 unless defined $maxPTMs;

	my $modifications = $dbh->selectall_hashref("SELECT id_modification,modification FROM modification",'modification') or die $dbh->errstr;

	# Snapshots of what is already stored, used to avoid duplicate inserts.
	my $already_saved=get_PTMs_from_db();
	my $partial_PTM=get_partial_PTMs_from_db();

	my $output="missing_PTMS_$$.xls";
	my $out_fh;
	if($ana){
		open($out_fh,'>',$output) or die "Can't write $output: $!\n";
	}

	# Modifications handled by the PIR and dbPTM passes, spelled the way
	# ucfirst(lc()) normalises them.
	my %tracked_modification = map { $_ => 1 } qw(Acetylation Sumoylation Ubiquitylation Phosphorylation);

	# Inserts one PTM and its residue/PMID row under the next free id_ptm,
	# counts it and returns the id that was used.
	my $inserted=0;
	my $insert_ptm = sub {
		my($target_id,$modifier_id,$id_modification,$residue,$pmid)=@_;
		$maxPTMs++;
		$dbh->do('INSERT INTO PTMs (id_ptm,target_id,modifier_id,id_modification) VALUES (?,?,?,?)',
			undef,
			$maxPTMs,
			$target_id,
			$modifier_id,
			$id_modification
		);
		$dbh->do('INSERT INTO residues (id_ptm,residue,pmid) VALUES (?,?,?)',undef,$maxPTMs,$residue,$pmid);
		$inserted++;
		return $maxPTMs;
	};

	############################ Manually curated #######################################################################
	my $manual_file="Manually_curated_PTMs.txt";
	open(my $manual_fh,'<',$manual_file) or die "Can't read $manual_file: $!\n";
	while(<$manual_fh>){
		chomp;
		next if /^\s*$/;    # a blank line would otherwise be inserted as an all-NULL PTM
		my($target,$modification,$residue,$PMID,$modifier)=split(/\t/);
		my $id_modification=$modifications->{ucfirst(lc($modification))}->{id_modification};
		my $comprueba_PTM=$genes->{$target} ."\t" . $genes->{$modifier} ."\t". $id_modification ."\t". $residue;

		if(!exists $already_saved->{$comprueba_PTM}){
			# Remember the new row so that a repeated line in this file, or
			# the same PTM in ptm.txt, is not inserted a second time.
			$already_saved->{$comprueba_PTM}=$insert_ptm->($genes->{$target},$genes->{$modifier},$id_modification,$residue,$PMID);
		}
		else{
			print STDERR "$comprueba_PTM exists\n";
		}
	}
	close $manual_fh;

	############################ PIR #######################################################################

	my $ptm_file="ptm.txt";
	if(!-e $ptm_file){
		print STDERR date() . " File for ptms is missing, downloading from ftp://ftp.pir.georgetown.edu/databases/iptmnet/ptm.txt\n";
		system("wget","ftp://ftp.pir.georgetown.edu/databases/iptmnet/ptm.txt");
	}
	# Re-check after the download attempt; processing continues as soon as the
	# file is available instead of being skipped on the run that fetched it.
	if(!-e $ptm_file){
		print STDERR date() . " File for ptms is STILL missing, Network problems ?\nQuitting\n\n";
		exit(1);
	}
	print STDERR date() . " Processing ptm.txt\n";

	open(my $ptm_fh,'<',$ptm_file) or die "Can't read $ptm_file: $!\n";
	while(<$ptm_fh>){
		chomp;
		my($modification,$db,$gene1,$synonym1,$org,$residue,$gene2,$synonym2,$type,$pmids)=split(/\t/);

		# Normalise once so the existence tests and the id look-ups below use
		# exactly the same keys.
		$synonym1=uc($synonym1);
		$synonym2=uc($synonym2);
		$modification=ucfirst(lc($modification));
		my $id_modification=$modifications->{$modification}->{id_modification};

		# The modification was normalised above, so it has to be compared
		# with the normalised spelling (an upper-case literal never matches).
		if($tracked_modification{$modification} and $org eq "Homo sapiens (Human)" and $pmids and $gene2
			and exists $genes->{$synonym1} and exists $genes->{$synonym2}){
			my $comprueba_PTM=$genes->{$synonym1} ."\t" . $genes->{$synonym2} ."\t". $id_modification ."\t$residue";
			if(!exists $already_saved->{$comprueba_PTM} and $genes->{$synonym2}){
				my @pmid=split(",",$pmids);
				# Only the first PMID is stored with the residue.
				$already_saved->{$comprueba_PTM}=$insert_ptm->($genes->{$synonym1},$genes->{$synonym2},$id_modification,$residue,$pmid[0]);
			}
		}
		if(exists $genes->{$synonym1} and !$synonym2 and $pmids and !exists $partial_PTM->{$genes->{$synonym1} ."\t". $id_modification}){
			#For Ana
			$pmids=~s/,/;/g;
			$lost->{"$synonym1\t$modification\t$residue\t$pmids"}++;
			print {$out_fh} "$synonym1\t$modification\t$residue\t$pmids\n" if($ana);
		}
	}
	close $ptm_fh;

	# Refresh both snapshots so the dbPTM pass sees the rows inserted above.
	$already_saved=get_PTMs_from_db();
	$partial_PTM=get_partial_PTMs_from_db();

	############################ dBPTM #######################################################################

	my $dbptm_file="dbPTM3.txt";
	if(!-e $dbptm_file){
		print STDERR date() . " File for ptms is missing, downloading from http://dbptm.mbc.nctu.edu.tw/download/dbPTM.tgz\n";
		system("wget","http://dbptm.mbc.nctu.edu.tw/download/dbPTM.tgz");
		system("tar","-xzf","dbPTM.tgz");
	}
	if(!-e $dbptm_file){
		print STDERR date() . " File for ptms is STILL missing, Network problems ?\nQuitting\n\n";
		exit(1);
	}
	print STDERR date() . " Processing dbPTM3.txt\n";

	my $saved={};       # sites without modifier already examined in this pass
	my $otro_saved={};  # sites inserted with a modifier in this pass
	open(my $dbptm_fh,'<',$dbptm_file) or die "Can't read $dbptm_file: $!\n";
	while(<$dbptm_fh>){
		chomp;
		my($uniprotName,$uniprotACC,$residueNumber,undef,$pmids,$source,$residueLetter,$modification)=split(/\t/);
		# Drops the last character of the modification column.
		# NOTE(review): presumably a trailing carriage return - confirm
		# against the actual dbPTM3.txt file.
		chop($modification);
		$pmids=~s/-//g;
		$uniprotName=uc($uniprotName);

		next unless (exists $uniProt_id->{$uniprotName} and $pmids);

		my $target_id=$uniProt_id->{$uniprotName};
		my $gene=$gene_of->{$target_id};
		my $site="$residueLetter$residueNumber";

		if($modification =~/\(/){
			# "Modification (MODIFIER)": split the modifier off the name.
			my $modifier;
			if($modification=~s/\s+\((.+)\)//){
				$modifier=$1;
			}
			my @pmid=split(",",$pmids);
			my $id_modification=$modifications->{ucfirst(lc($modification))}->{id_modification};
			my $modifier_id=$genes->{$modifier};

			my $seen_key="$uniprotName\t$gene\t$site\t$modification";
			my $comprueba_PTM=$target_id ."\t". $modifier_id ."\t". $id_modification ."\t$site";
			if(!exists $otro_saved->{$seen_key} and $modifier_id and !exists $already_saved->{$comprueba_PTM}){
				$insert_ptm->($target_id,$modifier_id,$id_modification,$site,$pmid[0]);
				$otro_saved->{$seen_key}++;
			}
		}
		elsif($tracked_modification{$modification}){
			my $id_modification=$modifications->{ucfirst(lc($modification))}->{id_modification};
			my $seen_key="$uniprotName\t$gene\t$site\t$modification";
			if(!exists $saved->{$seen_key}){
				$saved->{$seen_key}++;
				# No modifier is known here: remember the site unless the
				# target/modification pair is already stored.
				if(!exists $partial_PTM->{$target_id ."\t". $id_modification}){
					$lost->{"$gene\t$modification\t$site\t$pmids"}++;
				}
			}
		}
	}
	close $dbptm_fh;

	# Report every PTM inserted during this run (manual, PIR and dbPTM).
	print STDERR date() . " A total of new ". $inserted . " PTMs have been inserted in $database\n";

	# Regroup the PTMs without modifier as gene -> modification -> residue ->
	# PMID and count the PMID entries (only used by the debug output below).
	my $lost_by_gene={};
	my $cont_pmid=0;
	foreach my $los(keys %$lost){
		my($gene,$modification,$residue,$pmids)=split(/\t/,$los);
		foreach my $pmid (split(/;/,$pmids)){
			$lost_by_gene->{$gene}->{$modification}->{$residue}->{$pmid}++;
		}
	}
	foreach my $gen(sort keys %$lost_by_gene){
		foreach my $mod (sort keys %{$lost_by_gene->{$gen}}){
			foreach my $res (sort keys %{$lost_by_gene->{$gen}->{$mod}}){
				#print $gen ."\t" . $mod ."\t". $res ."\t". join(";", keys %{$lost_by_gene->{$gen}->{$mod}->{$res}}) ."\n";
				$cont_pmid += scalar(keys %{$lost_by_gene->{$gen}->{$mod}->{$res}});
			}
		}
	}
	#print STDERR "Found a total of $cont_pmid different PMIDs\n";

	if($ana){
		# Buffered write errors only surface when the handle is closed.
		close($out_fh) or die "Can't close $output: $!\n";
		print STDERR "Please check $output\n";
	}
	$dbh->disconnect;
}

# Prints the usage message to STDERR and terminates the script.
# Takes no arguments and never returns.
sub help{
        # The heredoc body and its terminator start at column 0 so the text
        # is printed with a consistent left margin.
        print STDERR <<"END_USAGE";

Usage:
    perl $0 [--help] [--ana]

Getting help:
    --help    show this message and exit

Optional parameters:
    --ana     print results in a file

Examples:
    perl $0 -ana

END_USAGE
        exit();
}

# Returns the current local time as a bracketed log prefix such as
# "[06/27/16 14:03:59]" (month/day/two-digit year, 24-hour clock).
# The stamp is formatted in-process, so no external `date` command is
# spawned for every log line.
sub date{
        require POSIX;    # core module, loaded on first use
        # %m/%d/%y is the expansion of the %D used by the former `date` call.
        return("[" . POSIX::strftime("%m/%d/%y %H:%M:%S", localtime()) . "]");
}
# Loads every stored PTM together with its residue rows, using the
# file-scoped $dbh handle.
# Returns a hash reference keyed by
# "target_id\tmodifier_id\tid_modification\tresidue" whose value is the
# id_ptm of that row; the hash is empty when nothing is stored yet.
# Dies when the query cannot be prepared or executed.
sub get_PTMs_from_db{
	# Columns are named explicitly so the key does not depend on the order
	# in which the tables happen to declare them.
	my $sth = $dbh->prepare("select p.id_ptm, p.target_id, p.modifier_id, p.id_modification, r.residue from PTMs p, residues r where p.id_ptm=r.id_ptm")
		or die "Can't prepare query: " . $dbh->errstr . "\n";
	# Method calls are not interpolated inside strings, hence the concatenation.
	$sth->execute or die "can't execute the query: " . $sth->errstr . "\n";
	my $already_saved={};
	while(my($id_ptm,$target_id,$modifier_id,$id_modification,$residue) = $sth->fetchrow_array) {
		$already_saved->{"$target_id\t$modifier_id\t$id_modification\t$residue"}=$id_ptm;
	}
	return($already_saved);
}
# Loads every stored PTM without taking the modifier into account, using
# the file-scoped $dbh handle.
# Returns a hash reference keyed by "target_id\tid_modification" whose value
# is an id_ptm carrying that pair (the last one fetched when several rows
# share the key); the hash is empty when nothing is stored yet.
# Dies when the query cannot be prepared or executed.
sub get_partial_PTMs_from_db{
	#not taking into account the modifier
	my $sth = $dbh->prepare("select id_ptm, target_id, id_modification from PTMs")
		or die "Can't prepare query: " . $dbh->errstr . "\n";
	# Method calls are not interpolated inside strings, hence the concatenation.
	$sth->execute or die "can't execute the query: " . $sth->errstr . "\n";
	my $already_saved={};
	while(my($id_ptm,$target_id,$id_modification) = $sth->fetchrow_array) {
		$already_saved->{"$target_id\t$id_modification"}=$id_ptm;
	}
	return($already_saved);
}