<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">#!/usr/bin/env perl

use perl5i::2;
use Moose;

use Bio::Chado::Schema;

my ($database, $host, $username, $password) = @ARGV;

my $schema = Bio::Chado::Schema-&gt;connect("dbi:Pg:database=$database;host=$host",
                                       $username, $password,
                                       { auto_savepoint =&gt; 1 });

my $pombe = $schema-&gt;resultset('Organism::Organism')-&gt;find({ abbreviation =&gt; 'Spombe' });
my $gene_cvterm = $schema-&gt;resultset('Cv::Cvterm')-&gt;find({ name =&gt; 'gene' });

my $one_to_one_term = $schema-&gt;resultset('Cv::Cvterm')-&gt;find({ name =&gt; 'predominantly single copy (one to one)' });
my $one_to_one_rs = $schema-&gt;resultset('Sequence::FeatureCvterm')-&gt;search({ cvterm_id =&gt; $one_to_one_term-&gt;cvterm_id() });

my $vertebrates_term = $schema-&gt;resultset('Cv::Cvterm')-&gt;find({ name =&gt; 'conserved in vertebrates' });
my $vertebrates_rs = $schema-&gt;resultset('Sequence::FeatureCvterm')-&gt;search({ cvterm_id =&gt; $vertebrates_term-&gt;cvterm_id() });

my $pombe_genes_rs =
  $schema-&gt;resultset('Sequence::Feature')-&gt;search({ organism_id =&gt; $pombe-&gt;organism_id(),
                                                    type_id =&gt; $gene_cvterm-&gt;cvterm_id(),
                                                    feature_id =&gt; [
                                                      -and =&gt;
                                                      { -in =&gt; $one_to_one_rs-&gt;get_column('feature_id')-&gt;as_query() },
                                                      { -in =&gt; $vertebrates_rs-&gt;get_column('feature_id')-&gt;as_query() },
                                                    ]
                                                  });

my %pombe_genes = ();

while (defined (my $gene = $pombe_genes_rs-&gt;next())) {
  $pombe_genes{$gene-&gt;uniquename()} = $gene;
}

warn 'pombe gene count: ', $pombe_genes_rs-&gt;count(), "\n";


my $human = $schema-&gt;resultset('Organism::Organism')-&gt;find({ abbreviation =&gt; 'human' });

my $human_genes_rs =
  $schema-&gt;resultset('Sequence::Feature')-&gt;search({ organism_id =&gt; $human-&gt;organism_id(),
                                                    type_id =&gt; $gene_cvterm-&gt;cvterm_id(),
                                                  });

my %human_genes = ();

while (defined (my $gene = $human_genes_rs-&gt;next())) {
  $human_genes{$gene-&gt;uniquename()} = $gene;
}

warn 'human gene count: ', $human_genes_rs-&gt;count(), "\n";


my $ortholog_cvterm =
  $schema-&gt;resultset('Cv::Cvterm')-&gt;find({ name =&gt; 'orthologous_to' });

my $orth_rs =
  $schema-&gt;resultset('Sequence::FeatureRelationship')-&gt;search({
    'me.type_id' =&gt; $ortholog_cvterm-&gt;cvterm_id(),
    subject_id =&gt; {
      -in =&gt; $human_genes_rs-&gt;get_column('feature_id')-&gt;as_query(),
    },
    object_id =&gt; {
      -in =&gt; $pombe_genes_rs-&gt;get_column('feature_id')-&gt;as_query(),
    },
  },
  {
    prefetch =&gt; [
      'subject', 'object'
    ],
  });

warn 'orthologs from chado: ', $orth_rs-&gt;count(), "\n";

my %chado_pombe_to_human = ();
my %chado_human_to_pombe = ();
my %chado_orthologs = ();

while (defined (my $orth = $orth_rs-&gt;next())) {
  my $pombe_identifier = $orth-&gt;object()-&gt;uniquename();
  my $human_identifier = $orth-&gt;subject()-&gt;uniquename();

  push @{$chado_pombe_to_human{$pombe_identifier}}, $human_identifier;
  push @{$chado_human_to_pombe{$human_identifier}}, $pombe_identifier;

  my $key = "$pombe_identifier\t$human_identifier";
  $chado_orthologs{$key} = 1;
}

open my $orth_file, '&lt;', 'pombe_human.txt' or die;

my %compara_pombe_to_human = ();
my %compara_human_to_pombe = ();
my %compara_orthologs = ();

my $all_compara_ortholog_count = 0;

while (my $line = &lt;$orth_file&gt;) {
  next if $line =~ /stable_id/;
  my ($pombe_identifier, $human_identifier) = split /\s+/, $line;

  $all_compara_ortholog_count++;

  if (!exists $pombe_genes{$pombe_identifier}) {
    next;
  }

  push @{$compara_pombe_to_human{$pombe_identifier}}, $human_identifier;
  push @{$compara_human_to_pombe{$human_identifier}}, $pombe_identifier;

  my $key = "$pombe_identifier\t$human_identifier";
  $compara_orthologs{$key} = 1;
}

warn "orthologs from compara: $all_compara_ortholog_count\n";

my %kept_compara_orthologs = ();

POMBE: while (my ($pombe_identifier, $values) = each %compara_pombe_to_human) {
  my @human_orthologs = @$values;
  if (@human_orthologs &lt;= 2) {
    for my $human_ortholog (@human_orthologs) {
      if (@{$compara_human_to_pombe{$human_ortholog}} &gt; 1) {
        next POMBE;
      } else {
        my $key = "$pombe_identifier\t$human_ortholog";
        $kept_compara_orthologs{$key} = 1
      }
    }
  }
}

warn "compara orthologs after filter: ", scalar(keys %kept_compara_orthologs), "\n";



use Set::Scalar;

my $chado_orth_set = Set::Scalar-&gt;new(keys %chado_orthologs);
my $compara_orth_set = Set::Scalar-&gt;new(keys %kept_compara_orthologs);

my $intersect = $chado_orth_set * $compara_orth_set;

warn 'in chado and compara: ', $intersect-&gt;size(), "\n";

my $chado_only_set = $chado_orth_set - $compara_orth_set;
warn "chado only orthologs: ", $chado_only_set-&gt;size(), "\n";

my $compara_only_set = $compara_orth_set - $chado_orth_set;
warn "compara only orthologs: ", $compara_only_set-&gt;size(), "\n";



my %compara_orths_for_export = ();


# report pombe gene that have a different ortholog in Chado and Compara

for my $pombe_identifier (keys %pombe_genes) {
  my $chado_orth_set = Set::Scalar-&gt;new(@{$chado_pombe_to_human{$pombe_identifier}});
  my $compara_orth_set = Set::Scalar-&gt;new(@{$compara_pombe_to_human{$pombe_identifier}});

  next if $chado_orth_set-&gt;is_empty() &amp;&amp; $compara_orth_set-&gt;is_empty();

  my $quit = 0;

  if ($chado_orth_set-&gt;is_empty()) {
    warn "$pombe_identifier has no ortholog in chado but does in compara\n";
    $compara_orths_for_export{$pombe_identifier} = [$compara_orth_set-&gt;elements()];
    $quit = 1;
  }

  if ($compara_orth_set-&gt;is_empty()) {
    warn "$pombe_identifier has no ortholog in compara but does in chado\n";
    $quit = 1;
  }

  next if $quit;

  if ($chado_orth_set == $compara_orth_set) {
    warn "chado and compara have the same orthologs for $pombe_identifier\n";
  } else {
    my $diff = "chado: $chado_orth_set / compara: $compara_orth_set";
    print STDERR "chado and compara have different orthologs for $pombe_identifier: $diff";
    if ($compara_orth_set-&gt;is_superset($chado_orth_set)) {
      warn "  (compara has extra)\n";
      my @extra_orths = $compara_orth_set-&gt;difference($chado_orth_set)-&gt;elements();
      $compara_orths_for_export{$pombe_identifier} = [@extra_orths];
    } else {
      warn "\n$pombe_identifier: $diff\n";
    }
  }
}

warn "compara orthologs to load:\n";

while (my ($pombe_identifier, $compara_orths) = each %compara_orths_for_export) {
  print "$pombe_identifier\t", join (',', @$compara_orths), "\n";
}
</pre></body></html>