go_notes

Defindit Docs and Howto Home

This page last modified: Jul 06 2007
Feb 15 2007

x Loading the GO data into postgres
cat *.seq > go_schema.sql
emacs remove ` or remove ' ?
my2pg.pl go_schema.sql > go_schema.pgsql
\i go_schema.pgsql
emacs make loading script (or write a little Perl thing to create it
along with some print statments suitable to \i in pgsql).

x create index t4_term_index on term (is_obsolete,is_root);
Make is_obsolete and is_root constrained searches work at a reasonable speed.


x Create new assoc_seq_term without duplicate records
Took 15 seconds.
Older version took around 1 minute (before adding is_obsolete and is_root criteria), 

create table assoc_seq_term as
select term.id, term.name,term.acc,term.term_type,display_id
from
term,association,gene_product_seq,seq where
gene_product_seq.seq_id=seq.id and
association.gene_product_id=gene_product_seq.gene_product_id and
association.term_id=term.id and 
is_obsolete=0 and
is_root=0
group by term.id, term.name, term.acc, term.term_type, display_id;

gene_ontology=> select count(*) from assoc_seq_term;
 count
--------
 330538
(1 row)

x Index assoc_seq_term. Took 5 or 10 seconds to index
Speed up is from 250 ms down to 0.250 ms
create index ast_disply_id on assoc_seq_term (display_id);

x Get a report from the new _go table.

select * from
(select t3_term_type, t3_name, count(t3_name) from cowpea_accession_go_go where
t3_term_type = 'molecular_function' group by t3_term_type,t3_name) as a
order by a.count;

select * from
(select t3_term_type, t3_name, count(t3_name) from cowpea_accession_go_go where
t3_term_type = 'cellular_component' group by t3_term_type,t3_name) as a
order by a.count;

select * from
(select t3_term_type, t3_name, count(t3_name) from cowpea_accession_go_go where
t3_term_type = 'biological_process' group by t3_term_type,t3_name) as a
order by a.count;



-- Begin:
-- Look for a single protein. This older method does not remove duplicates.
select id,display_id,description from seq where display_id ='NP_195810';
  id   | display_id |                     description
 78267 | NP_195810  | WRKY62; transcription factor [Arabidopsis thaliana].

-- works. makes sense.
-- mole_func GO:0003700 biol_proc GO:0006355
-- 78267 | NP_195810 a WRKY protein
-- 103423 | AT5G01900.1 |    241820 | WRKY DNA-binding protein 62
select id,symbol,dbxref_id,full_name from gene_product,gene_product_seq
where
gene_product.id=gene_product_seq.gene_product_id and
gene_product_seq.seq_id=78267;

--    id   |   symbol    | dbxref_id | species_id | seconda... | type_id |          full_name
--  103423 | AT5G01900.1 |    241820 |     230763 |            |   23572 | WRKY DNA-binding 
select gene_product.* from gene_product,gene_product_seq
where
gene_product.id=gene_product_seq.gene_product_id and
gene_product_seq.seq_id=78267;

--works
select * from association where gene_product_id=103423;
   id   | term_id | gene_product_id | is_not | role_group | assocdate | source_db_id
 389039 |    1984 |          103423 |      0 |            |  20030606 |           24
 427196 |    1984 |          103423 |      0 |            |  20030417 |           20
 433987 |    1984 |          103423 |      0 |            |  20020522 |           20
 448942 |    4330 |          103423 |      0 |            |  20020522 |           20

-- works 
select * from term where id in (1984,4330);
  id  |                    name         |     term_type      |    acc     | is_obsolete | is_root
 1984 | transcription factor activity   | molecular_function | GO:0003700 |           0 |       0
 4330 | regulation of transcription, DNA| biological_process | GO:0006355 |           0 |       0
(2 rows)