#!/usr/bin/env perl

use perl5i::2;
use Moose;

use Bio::Chado::Schema;

my ($database, $host, $username, $password) = @ARGV;

my $schema = Bio::Chado::Schema->connect("dbi:Pg:database=$database;host=$host",
                                       $username, $password,
                                       { auto_savepoint => 1 });

my $pombe = $schema->resultset('Organism::Organism')->find({ abbreviation => 'Spombe' });
my $gene_cvterm = $schema->resultset('Cv::Cvterm')->find({ name => 'gene' });

my $one_to_one_term = $schema->resultset('Cv::Cvterm')->find({ name => 'predominantly single copy (one to one)' });
my $one_to_one_rs = $schema->resultset('Sequence::FeatureCvterm')->search({ cvterm_id => $one_to_one_term->cvterm_id() });

my $vertebrates_term = $schema->resultset('Cv::Cvterm')->find({ name => 'conserved in vertebrates' });
my $vertebrates_rs = $schema->resultset('Sequence::FeatureCvterm')->search({ cvterm_id => $vertebrates_term->cvterm_id() });

my $pombe_genes_rs =
  $schema->resultset('Sequence::Feature')->search({ organism_id => $pombe->organism_id(),
                                                    type_id => $gene_cvterm->cvterm_id(),
                                                    feature_id => [
                                                      -and =>
                                                      { -in => $one_to_one_rs->get_column('feature_id')->as_query() },
                                                      { -in => $vertebrates_rs->get_column('feature_id')->as_query() },
                                                    ]
                                                  });

my %pombe_genes = ();

while (defined (my $gene = $pombe_genes_rs->next())) {
  $pombe_genes{$gene->uniquename()} = $gene;
}

warn 'pombe gene count: ', $pombe_genes_rs->count(), "\n";


my $human = $schema->resultset('Organism::Organism')->find({ abbreviation => 'human' });

my $human_genes_rs =
  $schema->resultset('Sequence::Feature')->search({ organism_id => $human->organism_id(),
                                                    type_id => $gene_cvterm->cvterm_id(),
                                                  });

my %human_genes = ();

while (defined (my $gene = $human_genes_rs->next())) {
  $human_genes{$gene->uniquename()} = $gene;
}

warn 'human gene count: ', $human_genes_rs->count(), "\n";


my $ortholog_cvterm =
  $schema->resultset('Cv::Cvterm')->find({ name => 'orthologous_to' });

my $orth_rs =
  $schema->resultset('Sequence::FeatureRelationship')->search({
    'me.type_id' => $ortholog_cvterm->cvterm_id(),
    subject_id => {
      -in => $human_genes_rs->get_column('feature_id')->as_query(),
    },
    object_id => {
      -in => $pombe_genes_rs->get_column('feature_id')->as_query(),
    },
  },
  {
    prefetch => [
      'subject', 'object'
    ],
  });

warn 'orthologs from chado: ', $orth_rs->count(), "\n";

my %chado_pombe_to_human = ();
my %chado_human_to_pombe = ();
my %chado_orthologs = ();

while (defined (my $orth = $orth_rs->next())) {
  my $pombe_identifier = $orth->object()->uniquename();
  my $human_identifier = $orth->subject()->uniquename();

  push @{$chado_pombe_to_human{$pombe_identifier}}, $human_identifier;
  push @{$chado_human_to_pombe{$human_identifier}}, $pombe_identifier;

  my $key = "$pombe_identifier\t$human_identifier";
  $chado_orthologs{$key} = 1;
}

open my $orth_file, '<', 'pombe_human.txt' or die;

my %compara_pombe_to_human = ();
my %compara_human_to_pombe = ();
my %compara_orthologs = ();

my $all_compara_ortholog_count = 0;

while (my $line = <$orth_file>) {
  next if $line =~ /stable_id/;
  my ($pombe_identifier, $human_identifier) = split /\s+/, $line;

  $all_compara_ortholog_count++;

  if (!exists $pombe_genes{$pombe_identifier}) {
    next;
  }

  push @{$compara_pombe_to_human{$pombe_identifier}}, $human_identifier;
  push @{$compara_human_to_pombe{$human_identifier}}, $pombe_identifier;

  my $key = "$pombe_identifier\t$human_identifier";
  $compara_orthologs{$key} = 1;
}

warn "orthologs from compara: $all_compara_ortholog_count\n";

my %kept_compara_orthologs = ();

POMBE: while (my ($pombe_identifier, $values) = each %compara_pombe_to_human) {
  my @human_orthologs = @$values;
  if (@human_orthologs <= 2) {
    for my $human_ortholog (@human_orthologs) {
      if (@{$compara_human_to_pombe{$human_ortholog}} > 1) {
        next POMBE;
      } else {
        my $key = "$pombe_identifier\t$human_ortholog";
        $kept_compara_orthologs{$key} = 1
      }
    }
  }
}

warn "compara orthologs after filter: ", scalar(keys %kept_compara_orthologs), "\n";


use Set::Scalar;

my $chado_orth_set = Set::Scalar->new(keys %chado_orthologs);
my $compara_orth_set = Set::Scalar->new(keys %kept_compara_orthologs);

my $intersect = $chado_orth_set * $compara_orth_set;

warn 'in chado and compara: ', $intersect->size(), "\n";

my $chado_only_set = $chado_orth_set - $compara_orth_set;
warn "chado only orthologs: ", $chado_only_set->size(), "\n";

my $compara_only_set = $compara_orth_set - $chado_orth_set;
warn "compara only orthologs: ", $compara_only_set->size(), "\n";


my %compara_orths_for_export = ();


# report pombe gene that have a different ortholog in Chado and Compara

for my $pombe_identifier (keys %pombe_genes) {
  my $chado_orth_set = Set::Scalar->new(@{$chado_pombe_to_human{$pombe_identifier}});
  my $compara_orth_set = Set::Scalar->new(@{$compara_pombe_to_human{$pombe_identifier}});

  next if $chado_orth_set->is_empty() && $compara_orth_set->is_empty();

  my $quit = 0;

  if ($chado_orth_set->is_empty()) {
    warn "$pombe_identifier has no ortholog in chado but does in compara\n";
    $compara_orths_for_export{$pombe_identifier} = [$compara_orth_set->elements()];
    $quit = 1;
  }

  if ($compara_orth_set->is_empty()) {
    warn "$pombe_identifier has no ortholog in compara but does in chado\n";
    $quit = 1;
  }

  next if $quit;

  if ($chado_orth_set == $compara_orth_set) {
    warn "chado and compara have the same orthologs for $pombe_identifier\n";
  } else {
    my $diff = "chado: $chado_orth_set / compara: $compara_orth_set";
    print STDERR "chado and compara have different orthologs for $pombe_identifier: $diff";
    if ($compara_orth_set->is_superset($chado_orth_set)) {
      warn "  (compara has extra)\n";
      my @extra_orths = $compara_orth_set->difference($chado_orth_set)->elements();
      $compara_orths_for_export{$pombe_identifier} = [@extra_orths];
    } else {
      warn "\n$pombe_identifier: $diff\n";
    }
  }
}

warn "compara orthologs to load:\n";

while (my ($pombe_identifier, $compara_orths) = each %compara_orths_for_export) {
  print "$pombe_identifier\t", join (',', @$compara_orths), "\n";
}