Defindit Docs and Howto Home
Feb 15 2007
x Loading the GO data into postgres
cat *.seq > go_schema.sql
emacs remove ` or remove ' ?
my2pg.pl go_schema.sql > go_schema.pgsql
\i go_schema.pgsql
emacs make loading script (or write a little Perl thing to create it
along with some print statments suitable to \i in pgsql).
x create index t4_term_index on term (is_obsolete,is_root);
Make is_obsolete and is_root constrained searches work at a reasonable speed.
x Create new assoc_seq_term without duplicate records
Took 15 seconds.
Older version took around 1 minute (before adding is_obsolete and is_root criteria),
create table assoc_seq_term as
select term.id, term.name,term.acc,term.term_type,display_id
from
term,association,gene_product_seq,seq where
gene_product_seq.seq_id=seq.id and
association.gene_product_id=gene_product_seq.gene_product_id and
association.term_id=term.id and
is_obsolete=0 and
is_root=0
group by term.id, term.name, term.acc, term.term_type, display_id;
gene_ontology=> select count(*) from assoc_seq_term;
count
--------
330538
(1 row)
x Index assoc_seq_term. Took 5 or 10 seconds to index
Speed up is from 250 ms down to 0.250 ms
create index ast_disply_id on assoc_seq_term (display_id);
x Get a report from the new _go table.
select * from
(select t3_term_type, t3_name, count(t3_name) from cowpea_accession_go_go where
t3_term_type = 'molecular_function' group by t3_term_type,t3_name) as a
order by a.count;
select * from
(select t3_term_type, t3_name, count(t3_name) from cowpea_accession_go_go where
t3_term_type = 'cellular_component' group by t3_term_type,t3_name) as a
order by a.count;
select * from
(select t3_term_type, t3_name, count(t3_name) from cowpea_accession_go_go where
t3_term_type = 'biological_process' group by t3_term_type,t3_name) as a
order by a.count;
-- Begin:
-- Look for a single protein. This older method does not remove duplicates.
select id,display_id,description from seq where display_id ='NP_195810';
id | display_id | description
78267 | NP_195810 | WRKY62; transcription factor [Arabidopsis thaliana].
-- works. makes sense.
-- mole_func GO:0003700 biol_proc GO:0006355
-- 78267 | NP_195810 a WRKY protein
-- 103423 | AT5G01900.1 | 241820 | WRKY DNA-binding protein 62
select id,symbol,dbxref_id,full_name from gene_product,gene_product_seq
where
gene_product.id=gene_product_seq.gene_product_id and
gene_product_seq.seq_id=78267;
-- id | symbol | dbxref_id | species_id | seconda... | type_id | full_name
-- 103423 | AT5G01900.1 | 241820 | 230763 | | 23572 | WRKY DNA-binding
select gene_product.* from gene_product,gene_product_seq
where
gene_product.id=gene_product_seq.gene_product_id and
gene_product_seq.seq_id=78267;
--works
select * from association where gene_product_id=103423;
id | term_id | gene_product_id | is_not | role_group | assocdate | source_db_id
389039 | 1984 | 103423 | 0 | | 20030606 | 24
427196 | 1984 | 103423 | 0 | | 20030417 | 20
433987 | 1984 | 103423 | 0 | | 20020522 | 20
448942 | 4330 | 103423 | 0 | | 20020522 | 20
-- works
select * from term where id in (1984,4330);
id | name | term_type | acc | is_obsolete | is_root
1984 | transcription factor activity | molecular_function | GO:0003700 | 0 | 0
4330 | regulation of transcription, DNA| biological_process | GO:0006355 | 0 | 0
(2 rows)