hansoffate
October 1st, 2008, 07:33 PM
Hello all,
I am having a few issues with my script. I started out writing this script without "use strict", however, everyone suggested that I should, so I started converting everything over. Now that I added use strict and the my in their respective places, my script won't run. Here is my error.
hansoffate@grunt:~/Documents/Steve/Project/SGD_datapull$ perl projectV3.pl
syntax error at projectV3.pl line 32, near "$feat_type{"
syntax error at projectV3.pl line 33, near "$feat_qual{"
Execution of projectV3.pl aborted due to compilation errors.
I have a feeling this is an easy problem that I am just overlooking.
Alright, now down to the real problem. So far in the script, there has only been one entry per SGDID. However, in the go_slim_mapping.tab file (located here (ftp://genome-ftp.stanford.edu/pub/yeast/data_download/literature_curation/go_slim_mapping.tab)), the Molecular Function, Biological Process, and Cellular Component are stored as multiple line entries. The 4th field contains a 1 letter code F, P, or C, which represents the titles respectively. The 5th field actually contains the description that I want to put in my CSV file I am trying to create.
Example SGDID: S000004664
There are 4 "C" descriptions, 1 "F", and 3 "P"s. I was thinking of trying to join the descriptions together with a pipe and then putting those respective values in the CSV file. I'll readup on how to do this, but if anyone have any comments or suggestions, please post!
Thank you for the help,
Hans
#!/usr/bin/perl
use strict;
use warnings;
use LWP::Simple;
open IDS, "<fullsgdids.txt";
chomp (my @ids = <IDS> );
close(IDS);
##Tilda Delimitted File
open (MYFILE, '>data.csv');
print MYFILE "SGDID~ORF~Standard_Name~Alias~Description~Name_Des cription~Molecular_Function~Biological_Process~Cel lular_Component~Define~Mutant_Phenotype\n";
open (SGDFEAT, "SGD_features.tab") || die "File not found\n";
chomp (my @sgdfeats=<SGDFEAT>);
close (SGDFEAT);
#open (GENEASSOC, "gene_association.sgd") || die "File not found\n";
#chomp (my @gene=<GENEASSOC>);
#close (GENEASSOC);
open (SLIMMAP, "go_slim_mapping.tab") || die "File not found\n";
chomp (my @slim=<SLIMMAP>);
close (SLIMMAP);
##List of columns $sgdid, $feat_type, $feat_qual, $feat_name, $stnd_name, $alias,
##$parent, $sec_sgdid, $chrom, $start_coord, $stop_coord, $strand, $genetic_pos,
##$coord_ver, $seq_vers, $desc
foreach my $i (@sgdfeats) {
my ($sgdid, $feat_type, $feat_qual, $feat_name, $stnd_name, $alias, $parent, $sec_sgdid, $chrom, $start_coord, $stop_coord, $strand, $genetic_pos, $coord_ver, $seq_vers, $desc) = split(/\t/, $i);
my $feat_type{$sgdid} = $feat_type;
my $feat_qual{$sgdid} = $feat_qual;
my $feat_name{$sgdid} = $feat_name;
my $stnd_name{$sgdid} = $stnd_name;
my $desc{$sgdid} = $desc;
my $alias{$sgdid} = $alias;
my $parent{$sgdid} = $parent;
my $sec_sgdid{$sgdid} = $sec_sgdid;
my $chrom{$sgdid} = $chrom;
my $start_coord{$sgdid} = $start_coord;
my $stop_coord{$sgdid} = $stop_coord;
my $strand{$sgdid} = $strand;
my $genetic_pos{$sgdid} = $genetic_pos;
my $coord_ver{$sgdid} = $coord_ver;
my $seq_vers{$sgdid} = $seq_vers;
}
##List of Columns for gene_associations: $db, $db_obj_id, $db_obj_symb, $not
##$goid, $db_ref, $evid, $with, $aspect, $db_obj_name, $db_obj_synonym,
##$db_obj_type, $taxon, $date, $assgn_by
#foreach my $o (@gene) {
# my ($db, $sgdid, $db_obj_symb, $not, $goid, $db_ref, $evid, $with, $aspect, $db_obj_name, $db_obj_synonym, $db_obj_type, $taxon, $date, # $assgn_by) = split(/\t/, $o);
# my $db{$sgdid} = $db;
# my $db_obj_symb{$sgdid} = $db_obj_symb;
# my $not{$sgdid} = $not;
# my $goid{$sgdid} = $goid;
# my $db_ref{$sgdid} = $db_ref;
# my $evid{$sgdid} = $evid;
# my $with{$sgdid} = $with;
# my $aspect{$sgdid} = $aspect;
# my $db_obj_name{$sgdid} = $db_obj_name;
# my $db_obj_synonym{$sgdid} = $db_obj_synonym;
# my $db_obj_type{$sgdid} = $db_obj_type;
# my $taxon{$sgdid} = $taxon;
# my $date{$sgdid} = $date;
# my $assgn_by{$sgdid} = $assgn_by;
#}
##List of Columns for go_slim: $orf, $gene, $sgdid, $go_aspect, $go_slim, $goid, $feature_type
foreach my $p (@slim) {
my ($orf, $gene, $sgdid, $go_aspect, $go_slim, $goid, $feature_type) = split(/\t/, $p);
my $orf{$sgdid} = $orf;
my $gene{$sgdid} = $gene;
my $go_aspect{$sgdid} = $go_aspect;
my $go_slim{$sgdid} = $go_slim;
my $goid{$sgdid} = $goid;
my $feature_type{$sgdid} = $feature_type;
}
foreach my $ids (@ids) {
print MYFILE "$ids~$feat_name{$ids}~$stnd_name{$ids}~$alias{$ids }~$desc{$ids}~$go_aspect{$ids}~$go_slim{$ids}\n"
}
I am having a few issues with my script. I started out writing this script without "use strict", however, everyone suggested that I should, so I started converting everything over. Now that I added use strict and the my in their respective places, my script won't run. Here is my error.
hansoffate@grunt:~/Documents/Steve/Project/SGD_datapull$ perl projectV3.pl
syntax error at projectV3.pl line 32, near "$feat_type{"
syntax error at projectV3.pl line 33, near "$feat_qual{"
Execution of projectV3.pl aborted due to compilation errors.
I have a feeling this is an easy problem that I am just overlooking.
Alright, now down to the real problem. So far in the script, there has only been one entry per SGDID. However, in the go_slim_mapping.tab file (located here (ftp://genome-ftp.stanford.edu/pub/yeast/data_download/literature_curation/go_slim_mapping.tab)), the Molecular Function, Biological Process, and Cellular Component are stored as multiple line entries. The 4th field contains a 1 letter code F, P, or C, which represents the titles respectively. The 5th field actually contains the description that I want to put in my CSV file I am trying to create.
Example SGDID: S000004664
There are 4 "C" descriptions, 1 "F", and 3 "P"s. I was thinking of trying to join the descriptions together with a pipe and then putting those respective values in the CSV file. I'll readup on how to do this, but if anyone have any comments or suggestions, please post!
Thank you for the help,
Hans
#!/usr/bin/perl
use strict;
use warnings;
use LWP::Simple;
open IDS, "<fullsgdids.txt";
chomp (my @ids = <IDS> );
close(IDS);
##Tilda Delimitted File
open (MYFILE, '>data.csv');
print MYFILE "SGDID~ORF~Standard_Name~Alias~Description~Name_Des cription~Molecular_Function~Biological_Process~Cel lular_Component~Define~Mutant_Phenotype\n";
open (SGDFEAT, "SGD_features.tab") || die "File not found\n";
chomp (my @sgdfeats=<SGDFEAT>);
close (SGDFEAT);
#open (GENEASSOC, "gene_association.sgd") || die "File not found\n";
#chomp (my @gene=<GENEASSOC>);
#close (GENEASSOC);
open (SLIMMAP, "go_slim_mapping.tab") || die "File not found\n";
chomp (my @slim=<SLIMMAP>);
close (SLIMMAP);
##List of columns $sgdid, $feat_type, $feat_qual, $feat_name, $stnd_name, $alias,
##$parent, $sec_sgdid, $chrom, $start_coord, $stop_coord, $strand, $genetic_pos,
##$coord_ver, $seq_vers, $desc
foreach my $i (@sgdfeats) {
my ($sgdid, $feat_type, $feat_qual, $feat_name, $stnd_name, $alias, $parent, $sec_sgdid, $chrom, $start_coord, $stop_coord, $strand, $genetic_pos, $coord_ver, $seq_vers, $desc) = split(/\t/, $i);
my $feat_type{$sgdid} = $feat_type;
my $feat_qual{$sgdid} = $feat_qual;
my $feat_name{$sgdid} = $feat_name;
my $stnd_name{$sgdid} = $stnd_name;
my $desc{$sgdid} = $desc;
my $alias{$sgdid} = $alias;
my $parent{$sgdid} = $parent;
my $sec_sgdid{$sgdid} = $sec_sgdid;
my $chrom{$sgdid} = $chrom;
my $start_coord{$sgdid} = $start_coord;
my $stop_coord{$sgdid} = $stop_coord;
my $strand{$sgdid} = $strand;
my $genetic_pos{$sgdid} = $genetic_pos;
my $coord_ver{$sgdid} = $coord_ver;
my $seq_vers{$sgdid} = $seq_vers;
}
##List of Columns for gene_associations: $db, $db_obj_id, $db_obj_symb, $not
##$goid, $db_ref, $evid, $with, $aspect, $db_obj_name, $db_obj_synonym,
##$db_obj_type, $taxon, $date, $assgn_by
#foreach my $o (@gene) {
# my ($db, $sgdid, $db_obj_symb, $not, $goid, $db_ref, $evid, $with, $aspect, $db_obj_name, $db_obj_synonym, $db_obj_type, $taxon, $date, # $assgn_by) = split(/\t/, $o);
# my $db{$sgdid} = $db;
# my $db_obj_symb{$sgdid} = $db_obj_symb;
# my $not{$sgdid} = $not;
# my $goid{$sgdid} = $goid;
# my $db_ref{$sgdid} = $db_ref;
# my $evid{$sgdid} = $evid;
# my $with{$sgdid} = $with;
# my $aspect{$sgdid} = $aspect;
# my $db_obj_name{$sgdid} = $db_obj_name;
# my $db_obj_synonym{$sgdid} = $db_obj_synonym;
# my $db_obj_type{$sgdid} = $db_obj_type;
# my $taxon{$sgdid} = $taxon;
# my $date{$sgdid} = $date;
# my $assgn_by{$sgdid} = $assgn_by;
#}
##List of Columns for go_slim: $orf, $gene, $sgdid, $go_aspect, $go_slim, $goid, $feature_type
foreach my $p (@slim) {
my ($orf, $gene, $sgdid, $go_aspect, $go_slim, $goid, $feature_type) = split(/\t/, $p);
my $orf{$sgdid} = $orf;
my $gene{$sgdid} = $gene;
my $go_aspect{$sgdid} = $go_aspect;
my $go_slim{$sgdid} = $go_slim;
my $goid{$sgdid} = $goid;
my $feature_type{$sgdid} = $feature_type;
}
foreach my $ids (@ids) {
print MYFILE "$ids~$feat_name{$ids}~$stnd_name{$ids}~$alias{$ids }~$desc{$ids}~$go_aspect{$ids}~$go_slim{$ids}\n"
}