SPARQL example query
18: Select the number of UniProtKB entries for each of the EC (Enzyme Commission) top level categories
# Count, for each top-level EC (Enzyme Commission) class, the proteins that
# carry an enzyme classification falling under that class.
PREFIX ec: <http://purl.uniprot.org/enzyme/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT
    ?ecClass
    (COUNT(?protein) AS ?size)
WHERE {
    # The seven top-level EC classes to report on.
    VALUES ?ecClass {
        ec:1.-.-.-
        ec:2.-.-.-
        ec:3.-.-.-
        ec:4.-.-.-
        ec:5.-.-.-
        ec:6.-.-.-
        ec:7.-.-.-
    }

    # The EC number may hang off the entry itself or off one of its
    # domains / components.
    ?protein (up:enzyme | ((up:domain | up:component) / up:enzyme)) ?ecNumber .

    # Enzyme subclasses are materialized, so a single rdfs:subClassOf hop is
    # enough; do not use rdfs:subClassOf+ here.
    ?ecNumber rdfs:subClassOf ?ecClass .
}
GROUP BY ?ecClass
ORDER BY ?ecClass
19: Find all natural variant annotations if associated via an evidence tag to an article with a PubMed identifier
xxxxxxxxxx
# List natural variant annotations whose evidence is attributed to a journal
# citation, returning short identifiers cut out of the protein, annotation and
# citation IRIs.
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?accession ?annotation_acc ?pubmed
WHERE {
    ?protein a up:Protein ;
        up:annotation ?variant .
    ?variant a up:Natural_Variant_Annotation .

    # The statement node that has the annotation as its rdf:object carries
    # the attribution, which in turn points at the cited source.
    ?statement rdf:object ?variant ;
        up:attribution ?attribution .
    ?attribution up:source ?citation .
    ?citation a up:Journal_Citation .

    # Offset 33 drops the 32-character "http://purl.uniprot.org/uniprot/" prefix.
    BIND(SUBSTR(STR(?protein), 33) AS ?accession)

    # Annotation IRIs containing "#SIP" are cut at offset 33, all others at
    # offset 36.
    # NOTE(review): presumably the "#SIP" IRIs live under .../uniprot/ and the
    # rest under .../annotation/ - confirm against the data.
    BIND(
        IF(CONTAINS(STR(?variant), "#SIP"),
           SUBSTR(STR(?variant), 33),
           SUBSTR(STR(?variant), 36))
        AS ?annotation_acc
    )

    # NOTE(review): offset 35 assumes citation IRIs start with
    # "http://purl.uniprot.org/citations/" and end in the PubMed id - confirm.
    BIND(SUBSTR(STR(?citation), 35) AS ?pubmed)
}
20: Find how often an article in PubMed was used in an evidence tag in a human protein (ordered by most used to least)
xxxxxxxxxx
# For every journal citation used as the source of an attribution on a human
# (taxon:9606) protein annotation, count the attributions, most used first.
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
PREFIX up: <http://purl.uniprot.org/core/>

# The projected name ?attribitions (sic) is kept as-is: it is the result
# column name that consumers of this query see.
SELECT ?source (COUNT(?attribution) AS ?attribitions)
WHERE {
    ?protein a up:Protein ;
        up:organism taxon:9606 ;
        up:annotation ?annotation .

    # Statement node linking the annotation to its attribution.
    ?statement rdf:object ?annotation ;
        up:attribution ?attribution .

    ?attribution up:source ?source .
    ?source a up:Journal_Citation .
}
GROUP BY ?source
ORDER BY DESC(?attribitions)
21: Find where disease related proteins are known to be located in the cell
xxxxxxxxxx
# Pair every disease label annotated on a protein with the cellular components
# the same protein is annotated as being located in.
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?protein ?disease ?location_inside_cell ?cellcmpt
WHERE {
    # Disease side: annotation -> disease -> preferred label.
    ?protein up:annotation ?diseaseNote .
    ?diseaseNote up:disease / skos:prefLabel ?disease .

    # Location side: annotation -> location -> cellular component (+ label).
    ?protein up:annotation ?locationNote .
    ?locationNote up:locatedIn / up:cellularComponent ?cellcmpt .
    ?cellcmpt skos:prefLabel ?location_inside_cell .
}
22: For two accession numbers (ACs) find the GO term labels and group them into GO process,function and component
xxxxxxxxxx
# For two accessions, collect the labels of the GO terms each protein is
# classified with, split into cellular component, biological process and
# molecular function, one row per protein.
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX GO: <http://purl.obolibrary.org/obo/GO_>

SELECT
    # Offset 33 drops the "http://purl.uniprot.org/uniprot/" prefix built below.
    (SUBSTR(STR(?protein), 33) AS ?uniprot)
    (GROUP_CONCAT(?celtype; separator=";") AS ?celtypes)
    (GROUP_CONCAT(?biotype; separator=";") AS ?biotypes)
    (GROUP_CONCAT(?moltype; separator=";") AS ?moltypes)
WHERE {
    VALUES ?ac { "Q6GZX4" "Q96375" }
    BIND(IRI(CONCAT("http://purl.uniprot.org/uniprot/", ?ac)) AS ?protein)

    ?protein a up:Protein ;
        up:classifiedWith ?goTerm .

    # biological_process (GO:0008150)
    OPTIONAL {
        ?goTerm rdfs:subClassOf GO:0008150 ;
            rdfs:label ?biotype .
    }

    # cellular_component (GO:0005575)
    OPTIONAL {
        ?goTerm rdfs:subClassOf GO:0005575 ;
            rdfs:label ?celtype .
    }

    # molecular_function (GO:0003674)
    OPTIONAL {
        ?goTerm rdfs:subClassOf GO:0003674 ;
            rdfs:label ?moltype .
    }

    # Classifications that matched none of the three aspects (the UniProt
    # keywords) are filtered out.
    FILTER(BOUND(?biotype) || BOUND(?celtype) || BOUND(?moltype))
}
GROUP BY ?protein
23: Number of reviewed human entries (UniProtKB/Swiss-Prot) that are related to kinase activity
xxxxxxxxxx
# Count the distinct reviewed human (taxon:9606) proteins classified with
# GO:0016301 or with a term that is a direct rdfs:subClassOf of it.
PREFIX GO: <http://purl.obolibrary.org/obo/GO_>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT (COUNT(DISTINCT ?protein) AS ?pc)
WHERE {
    ?protein rdf:type up:Protein .
    ?protein up:reviewed true .
    ?protein up:organism taxon:9606 .

    # Either the term itself or one rdfs:subClassOf hop below it.
    ?protein (up:classifiedWith | (up:classifiedWith / rdfs:subClassOf)) GO:0016301 .
}
24: Find the release number of the UniProt data that is currently being queried
xxxxxxxxxx
# Read the pav:version value(s) recorded in the endpoint's VoID description
# graph.
PREFIX pav: <http://purl.org/pav/>

SELECT ?version
FROM <https://sparql.uniprot.org/.well-known/void>
WHERE {
    # Any subject will do; only the version value is of interest.
    [] pav:version ?version .
}
25: Find UniProtKB entry which has a protein name 'HLA class I histocompatibility antigen, B-73 alpha chain'
xxxxxxxxxx
# Find proteins whose recommended or alternative name carries the given
# literal, and report which kind of structured-name property holds it.
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?protein ?anyKindOfName
WHERE {
    ?protein a up:Protein ;
        (up:recommendedName | up:alternativeName) ?nameNode .

    # NOTE(review): the heading of this example names
    # '... B-73 alpha chain' while the literal matched here is
    # '... B alpha chain' - confirm which name is intended.
    ?nameNode ?anyKindOfName "HLA class I histocompatibility antigen, B alpha chain" .

    # Keep only predicates that are structured-name properties.
    ?anyKindOfName rdfs:subPropertyOf up:structuredNameType .
}
26: Find UniProtKB proteins which are cleaved and contain a domain or component named 'HLA class I histocompatibility antigen, B-73 alpha chain'
xxxxxxxxxx
# Find proteins for which the given name is held either by the entry itself
# or by one of its domains / components, and report the kind of name property.
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?protein ?anyKindOfName
WHERE {
    ?protein a up:Protein .

    # First alternative: name directly on the entry.
    # Second alternative: name on a domain or component of the entry.
    ?protein
        ( (up:recommendedName | up:alternativeName)
        | ( (up:domain | up:component)
            / (up:recommendedName | up:alternativeName) )
        )
        ?nameNode .

    ?nameNode ?anyKindOfName "HLA class I histocompatibility antigen, B-73 alpha chain" .

    # Keep only predicates that are structured-name properties.
    ?anyKindOfName rdfs:subPropertyOf up:structuredNameType .
}
26: Find a UniProtKB entry, or a UniProtKB entry's domain or component, which has the name 'HLA class I histocompatibility antigen, B-73 alpha chain'
xxxxxxxxxx
# Match the given name against the structured names of a protein entry and of
# its domains / components; return the protein and the matching name property.
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?protein ?anyKindOfName
WHERE {
    ?protein a up:Protein ;
        # Either a name of the entry, or a name of one of its parts.
        (
            (up:recommendedName | up:alternativeName)
            |
            ((up:domain | up:component) / (up:recommendedName | up:alternativeName))
        ) ?structuredNameNode .

    ?structuredNameNode
        ?anyKindOfName "HLA class I histocompatibility antigen, B-73 alpha chain" .

    # Restrict ?anyKindOfName to sub-properties of up:structuredNameType.
    ?anyKindOfName rdfs:subPropertyOf up:structuredNameType .
}
27: Retrieve all protein names, including peptide names, associated with UniProtKB entry P05067.
xxxxxxxxxx
# List every structured name value of entry P05067, taken from the entry
# itself and from its domains / components. ?partType is bound only for names
# that come from a domain or component.
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?protein ?anyKindOfName ?names ?partType
WHERE {
    BIND(<http://purl.uniprot.org/uniprot/P05067> AS ?protein)
    ?protein a up:Protein .

    {
        # Names of the entry as a whole.
        ?protein (up:recommendedName | up:alternativeName) ?nameNode .
    }
    UNION
    {
        # Names of a domain or component; ?partType records which one.
        VALUES ?partType { up:domain up:component }
        ?protein ?partType ?subpart .
        ?subpart (up:recommendedName | up:alternativeName) ?nameNode .
    }

    # Every structured-name property of the name node and its value.
    ?nameNode ?anyKindOfName ?names .
    ?anyKindOfName rdfs:subPropertyOf up:structuredNameType .
}
27: Find all names associated with UniProtKB entry P05067, and whether each name is associated with the entry, its domains or its components
xxxxxxxxxx
# Return all structured names linked to entry P05067 and, for names attached
# to a domain or component rather than the entry itself, the linking property
# in ?partType (left unbound for entry-level names).
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT
    ?protein
    ?anyKindOfName
    ?names
    ?partType
WHERE {
    BIND(<http://purl.uniprot.org/uniprot/P05067> AS ?protein)
    ?protein a up:Protein .

    {
        # Branch 1: the entry's own recommended / alternative names.
        ?protein (up:recommendedName | up:alternativeName) ?structuredNameNode .
    }
    UNION
    {
        # Branch 2: names reached through a domain or a component.
        VALUES ?partType { up:domain up:component }
        ?protein ?partType ?proteinPart .
        ?proteinPart (up:recommendedName | up:alternativeName) ?structuredNameNode .
    }

    ?structuredNameNode ?anyKindOfName ?names .

    # Only predicates declared as sub-properties of up:structuredNameType.
    ?anyKindOfName rdfs:subPropertyOf up:structuredNameType .
}
28: Get the list of UniProtKB entries for the chromosome of proteome UP000000625
xxxxxxxxxx
# List the reviewed proteins assigned to the "Chromosome" component of
# proteome UP000000625.
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?protein ?proteome
WHERE {
    # The proteome component to look up, projected back as ?proteome.
    VALUES ?proteome {
        <http://purl.uniprot.org/proteomes/UP000000625#Chromosome>
    }

    ?protein a up:Protein .
    ?protein up:reviewed true .
    ?protein up:proteome ?proteome .
}
29: Use ALLIE a service for Abbreviation / Long Form in Japanese and English to search in UniProt using Japanese.
xxxxxxxxxx
# Search UniProt by a Japanese term: ask the ALLIE service for the English
# label(s) of the resource that carries the Japanese label, then find the
# proteins (or their domains / components) that have that English text as a
# structured name.
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?protein ?englishLabelStr
WHERE {
    SERVICE <https://data.allie.dbcls.jp/sparql> {
        # The Japanese label must carry its language tag: a plain literal
        # never matches a language-tagged one, and the labels here are
        # language-tagged (see the lang() filter on the English label below).
        ?x rdfs:label "アミロイド前駆体タンパク質"@ja ;
            rdfs:label ?englishLabel .
        FILTER(lang(?englishLabel) = "en")
    }

    # Strip the language tag so the value can be joined against name values
    # as a plain string.
    BIND(STR(?englishLabel) AS ?englishLabelStr)

    ?protein a up:Protein .

    {
        # Names of the entry itself.
        ?protein (up:recommendedName | up:alternativeName) ?structuredName .
    }
    UNION
    {
        # Names of a domain or component of the entry.
        VALUES (?partType) { (up:domain) (up:component) }
        ?protein ?partType ?part .
        ?part (up:recommendedName | up:alternativeName) ?structuredName .
    }

    ?structuredName ?anyKindOfName ?englishLabelStr .

    # Only predicates declared as sub-properties of up:structuredNameType.
    ?anyKindOfName rdfs:subPropertyOf up:structuredNameType .
}
30: Find UniProtKB entries with merged loci in Bordetella avium
xxxxxxxxxx
# For organism taxon:360910, list the proteins that end up with more than one
# locus name across their encoding genes, with the names joined by commas.
PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?protein (GROUP_CONCAT(?locusName; separator=',') AS ?locusNames)
WHERE {
    ?protein a up:Protein .
    ?protein up:organism taxon:360910 .

    # Gene(s) encoding the protein and their locus names.
    ?protein up:encodedBy ?codingGene .
    ?codingGene up:locusName ?locusName .
}
GROUP BY ?protein
# Keep only proteins with at least two locus-name rows.
HAVING (COUNT(?locusName) > 1)
31: Find UniParc records whose sequence point to the most database entries
xxxxxxxxxx
# Rank UniParc sequences by the number of entries they are the sequence for,
# highest first.
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?sequence ?entries
WHERE {
    {
        # Inner query: one row per sequence with its entry count.
        SELECT ?sequence (COUNT(?entry) AS ?entries)
        WHERE {
            GRAPH <http://sparql.uniprot.org/uniparc> {
                ?sequence up:sequenceFor ?entry .
            }
        }
        GROUP BY ?sequence
    }
}
ORDER BY DESC(?entries)
32: Find UniProtKB entries with more than 1 Topological domain annotation
xxxxxxxxxx
# List proteins with more than one topological-domain annotation row, with
# the annotation comments joined into a single string.
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?protein (GROUP_CONCAT(?comment; separator=", ") AS ?comments)
WHERE {
    ?protein a up:Protein .
    ?protein up:annotation ?annotation .

    # Only annotations typed as topological domains, each with its comment.
    ?annotation rdf:type up:Topological_Domain_Annotation .
    ?annotation rdfs:comment ?comment .
}
GROUP BY ?protein
HAVING (COUNT(?annotation) > 1)
33: Find longest comment text associated with a UniProtKB natural variant annotation
xxxxxxxxxx
# Return natural variant annotations with their comments, ordered so that the
# longest comment text comes first (no LIMIT is applied).
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?annotation ?comment
WHERE {
    ?annotation a up:Natural_Variant_Annotation .
    ?annotation rdfs:comment ?comment .
}
ORDER BY DESC(STRLEN(?comment))
34: Find the co-occurrence count of topological domain comment text in UniProtKB entries
xxxxxxxxxx
# Count how often two topological-domain comment texts occur on different
# annotations of the same protein. Comments are cut at the first ';' before
# being compared; pairs are counted in both orders.
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?comment1 ?comment2 (COUNT(?comment1) AS ?count1)
WHERE {
    ?protein a up:Protein ;
        up:annotation ?annotation1 ;
        up:annotation ?annotation2 .

    # Two distinct topological-domain annotations of the same protein.
    FILTER(?annotation1 != ?annotation2)

    ?annotation1 rdf:type up:Topological_Domain_Annotation ;
        rdfs:comment ?fullComment1 .
    ?annotation2 rdf:type up:Topological_Domain_Annotation ;
        rdfs:comment ?fullComment2 .

    # Keep the text before the first ';'. The CONTAINS guard is needed
    # because STRBEFORE yields an empty string when the separator is absent.
    BIND(
        IF(CONTAINS(?fullComment1, ';'), STRBEFORE(?fullComment1, ';'), ?fullComment1)
        AS ?comment1
    )
    BIND(
        IF(CONTAINS(?fullComment2, ';'), STRBEFORE(?fullComment2, ';'), ?fullComment2)
        AS ?comment2
    )
}
GROUP BY ?comment1 ?comment2
ORDER BY DESC(?count1)
35: Find the similar proteins for UniProtKB entry P05067 sorted by UniRef cluster identity
xxxxxxxxxx
# Find the proteins that share a UniRef cluster with UniProtKB entry P05067,
# ordered by the cluster's identity level (highest first).
PREFIX uniprotkb: <http://purl.uniprot.org/uniprot/>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?similar ?identity
FROM <http://sparql.uniprot.org/uniref>
FROM <http://sparql.uniprot.org/uniprot>
WHERE {
    # Entry of interest. The accession was previously written as P05607
    # (digits transposed), which does not match the entry this example and
    # the other P05067 examples are about.
    BIND(uniprotkb:P05067 AS ?protein)

    # Clusters that have a member whose sequence belongs to ?protein ...
    ?cluster up:member ?member ;
        up:member/up:sequenceFor ?protein ;
        up:identity ?identity .

    # ... and every protein any member of such a cluster is the sequence for.
    ?member up:sequenceFor ?similar .

    # Do not report the query protein as similar to itself.
    FILTER(!sameTerm(?similar, ?protein))
}
ORDER BY DESC(?identity)
36: Find the orthologous proteins for UniProtKB entry P05067 using the OrthoDB database
xxxxxxxxxx
# Describe UniProtKB entry P05067 together with the OrthoDB orthologous
# group(s) it is cross-referenced to: organism, preferred gene name, optional
# function comment, and how far the sequence length deviates from the group's
# median protein length.
PREFIX orthodb: <http://purl.orthodb.org/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX uniprotkb: <http://purl.uniprot.org/uniprot/>
PREFIX up: <http://purl.uniprot.org/core/>

# The projected names (including ?prefferedGeneName, sic) are kept unchanged
# because they are the result column names.
SELECT
    ?protein
    ?orthoGroup
    ?scientificName
    ?functionComment
    ?prefferedGeneName
    ((STRLEN(?value) - ?medianLength) AS ?deviationFromMedianLength)
WHERE {
    # Bind the entry once so that ?protein is shared by all local patterns.
    # Previously ?protein only occurred inside the OPTIONAL, so it was never
    # tied to P05067 and the OPTIONAL matched the function annotations of
    # every protein.
    VALUES ?protein { uniprotkb:P05067 }

    ?protein a up:Protein ;
        up:organism/up:scientificName ?scientificName ;
        rdfs:seeAlso ?orthoGroup ;
        up:encodedBy/skos:prefLabel ?prefferedGeneName ;
        up:sequence/rdf:value ?value .

    # Function comment, when the entry has a function annotation.
    OPTIONAL {
        ?protein up:annotation ?functionAnnotation .
        ?functionAnnotation a up:Function_Annotation ;
            rdfs:comment ?functionComment .
    }

    # The accession is repeated as a constant inside the SERVICE block
    # (as in the original query) so the remote pattern stays selective on
    # its own; keep it in sync with the VALUES clause above.
    SERVICE <https://sparql.orthodb.org/sparql> {
        ?orthoGroup orthodb:ogMedianProteinLength ?medianLength ;
            orthodb:hasMember ?xref .
        ?xref orthodb:xref/orthodb:xrefResource uniprotkb:P05067 .
    }
}
37: Find the human protein which contains an epitope VSTQ, where T is a phosphorylated threonine
xxxxxxxxxx
# Find human (taxon:9606) proteins with a modified-residue annotation whose
# comment mentions "Phosphothreonine" and whose annotated position is the
# third residue of the four-letter stretch "VSTQ".
PREFIX faldo: <http://biohackathon.org/resource/faldo#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?protein ?comment ?begin ?end
WHERE {
    ?protein a up:Protein ;
        up:organism taxon:9606 ;
        up:sequence ?sequence ;
        up:annotation ?modification .

    ?modification a up:Modified_Residue_Annotation ;
        rdfs:comment ?comment ;
        up:range ?span .

    # Both ends of the range must be positions on this protein's sequence.
    ?span faldo:begin ?startNode ;
        faldo:end ?endNode .
    ?startNode faldo:position ?begin ;
        faldo:reference ?sequence .
    ?endNode faldo:position ?end ;
        faldo:reference ?sequence .

    ?sequence rdf:value ?aaSequence .

    FILTER(CONTAINS(?comment, "Phosphothreonine"))

    # Take four residues starting two before the annotated position, so the
    # annotated residue is the 'T' of "VSTQ".
    FILTER(SUBSTR(?aaSequence, ?begin - 2, 4) = "VSTQ")
}
38: For the human entry P05067 (Amyloid-beta precursor protein) find the gene start and end positions in Wikidata
xxxxxxxxxx
# Look up entry P05067 in Wikidata by its accession string and return the
# start / end values of the linked gene together with the chromosome and the
# English assembly label they are qualified with.
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX pq: <http://www.wikidata.org/prop/qualifier/>
PREFIX ps: <http://www.wikidata.org/prop/statement/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX uniprotkb: <http://purl.uniprot.org/uniprot/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>

SELECT ?protein ?begin ?end ?chromosome ?assembly
WHERE {
    {
        # Turn the entry IRI into the bare accession by cutting off the
        # uniprotkb: namespace.
        BIND(uniprotkb:P05067 AS ?proteinIRI)
        BIND(SUBSTR(STR(?proteinIRI), STRLEN(STR(uniprotkb:)) + 1) AS ?protein)
    }

    SERVICE <https://query.wikidata.org/sparql> {
        # NOTE(review): P352 / P702 presumably are "UniProt protein ID" and
        # "encoded by" - confirm against Wikidata.
        ?wikidataProtein wdt:P352 ?protein ;
            wdt:P702 ?wikidataGene .

        # NOTE(review): P644 / P645 presumably are genomic start / end, with
        # P1057 (chromosome) and P659 (assembly) as qualifiers - confirm.
        ?wikidataGene p:P644 ?startStatement ;
            p:P645 ?endStatement .

        # Start and end must agree on chromosome and assembly, because both
        # statements bind the same ?chromosome and ?assembly variables.
        ?startStatement ps:P644 ?begin ;
            pq:P1057/wdt:P1813 ?chromosome ;
            pq:P659/rdfs:label ?assembly .
        ?endStatement ps:P645 ?end ;
            pq:P1057/wdt:P1813 ?chromosome ;
            pq:P659/rdfs:label ?assembly .

        FILTER(lang(?assembly) = "en")
    }
}
39: Retrieve entries and catalytic activities in the reviewed (UniProtKB/Swiss-Prot) section that have experimental evidence
xxxxxxxxxx
# List reviewed proteins and the reactions of their catalytic activities
# where the activity statement is attributed with evidence ECO_0000269.
PREFIX ECO: <http://purl.obolibrary.org/obo/ECO_>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?protein ?rhea
WHERE {
    # ECO 269 is experimental evidence.
    BIND(ECO:0000269 AS ?evidence)

    GRAPH <http://sparql.uniprot.org/uniprot> {
        ?protein up:reviewed true ;
            up:annotation ?activityAnnotation ;
            up:attribution ?attribution .

        ?activityAnnotation a up:Catalytic_Activity_Annotation ;
            up:catalyticActivity ?activity .
        ?activity up:catalyzedReaction ?rhea .

        # Statement node describing the
        # (annotation, up:catalyticActivity, activity) triple; it must carry
        # the same attribution as the protein.
        [] rdf:subject ?activityAnnotation ;
            rdf:predicate up:catalyticActivity ;
            rdf:object ?activity ;
            up:attribution ?attribution .

        ?attribution up:evidence ?evidence .
    }
}
40: Retrieve human enzymes that metabolize sphingolipids and are annotated in ChEMBL
xxxxxxxxxx
# Find human (taxon:9606) proteins that catalyze a Rhea reaction involving a
# compound below CHEBI:26739 and that have a cross-reference into the ChEMBL
# database.
PREFIX CHEBI: <http://purl.obolibrary.org/obo/CHEBI_>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX rh: <http://rdf.rhea-db.org/>
PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT DISTINCT ?protein ?chemblEntry
WHERE {
    # Remote step: reactions with a participant whose ChEBI class is a
    # (transitive) subclass of CHEBI:26739.
    SERVICE <https://sparql.rhea-db.org/sparql> {
        ?reaction rdfs:subClassOf rh:Reaction .
        ?reaction rh:side/rh:contains/rh:compound/rh:chebi/rdfs:subClassOf+ CHEBI:26739 .
    }

    # Local step: catalytic activities for those reactions and the human
    # proteins annotated with them.
    ?activity up:catalyzedReaction ?reaction .
    ?protein up:annotation/up:catalyticActivity ?activity .
    ?protein up:organism taxon:9606 .

    # Cross-references of the protein that belong to ChEMBL.
    ?protein rdfs:seeAlso ?chemblEntry .
    ?chemblEntry up:database <http://purl.uniprot.org/database/ChEMBL> .
}
41: Retrieve UniProtKB entries with sequences that are composed of fragments
xxxxxxxxxx
# Retrieve the proteins that have a sequence flagged with up:fragment.
PREFIX up: <http://purl.uniprot.org/core/>

SELECT DISTINCT ?protein
WHERE {
    ?protein a up:Protein ;
        up:sequence ?sequence .

    # Require the fragment flag. The earlier form used
    # MINUS { ?sequence up:fragment [] }, which removed exactly these
    # sequences and therefore returned the entries WITHOUT fragments.
    ?sequence up:fragment [] .
}