SPARQL example queries
1: Select all taxa from the UniProt taxonomy
# Return every resource typed up:Taxon, reading from the taxonomy graph
# named in the FROM clause.
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?taxon
FROM <http://sparql.uniprot.org/taxonomy>
WHERE {
    ?taxon a up:Taxon .
}
2: Select all bacterial taxa and their scientific name from the UniProt taxonomy
xxxxxxxxxx
# Taxa that are subclasses of taxon:2 (the bacterial root used by this
# example), together with their scientific names.
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?taxon ?name
WHERE {
    ?taxon a up:Taxon ;
           up:scientificName ?name ;
           # Taxon subclasses are materialized, do not use rdfs:subClassOf+
           rdfs:subClassOf taxon:2 .
}
3: Select all UniProtKB entries, and their organism and amino acid sequences (including isoforms), for E. coli K12 and all its strains
xxxxxxxxxx
# Entries whose organism is a subclass of taxon:83333 (E. coli K12 in this
# example), with each attached sequence resource and its residue string.
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?protein ?organism ?isoform ?sequence
WHERE {
    ?protein a up:Protein ;
             up:organism ?organism ;
             up:sequence ?isoform .
    # Taxon subclasses are materialized, do not use rdfs:subClassOf+
    ?organism rdfs:subClassOf taxon:83333 .
    # rdf:value on the sequence resource carries the sequence text.
    ?isoform rdf:value ?sequence .
}
4: Select the UniProtKB entry with the mnemonic 'A4_HUMAN'
xxxxxxxxxx
# Look up the entry whose up:mnemonic is exactly 'A4_HUMAN'.
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?protein
WHERE {
    ?protein a up:Protein ;
             up:mnemonic 'A4_HUMAN' .
}
5: Select a mapping of UniProtKB to PDB entries using the UniProtKB cross-references to the PDB database
xxxxxxxxxx
# Pair each entry with every rdfs:seeAlso cross-reference that belongs to
# the PDB database resource.
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?protein ?db
WHERE {
    ?protein a up:Protein ;
             rdfs:seeAlso ?db .
    ?db up:database <http://purl.uniprot.org/database/PDB> .
}
6: Select all cross-references to external databases of the category '3D structure databases' of UniProtKB entries that are classified with the keyword 'Acetoin biosynthesis (KW-0005)'
xxxxxxxxxx
# Distinct cross-references of entries classified with keywords:5, limited
# to links whose database has the category '3D structure databases'.
PREFIX keywords: <http://purl.uniprot.org/keywords/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT DISTINCT ?link
WHERE {
    ?protein a up:Protein ;
             up:classifiedWith keywords:5 ;
             rdfs:seeAlso ?link .
    ?link up:database ?db .
    ?db up:category '3D structure databases' .
}
7: Select reviewed UniProtKB entries (Swiss-Prot), and their recommended protein name, that have a preferred gene name that contains the text 'DNA'
xxxxxxxxxx
# Reviewed entries and their recommended full name, kept only when the
# preferred label of an encoding gene contains the substring 'DNA'.
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?protein ?name
WHERE {
    ?protein a up:Protein ;
             up:reviewed true ;
             up:recommendedName ?recommended ;
             up:encodedBy ?gene .
    ?recommended up:fullName ?name .
    ?gene skos:prefLabel ?text .
    # CONTAINS is case-sensitive, so only upper-case 'DNA' matches.
    FILTER (CONTAINS(?text, 'DNA'))
}
8: Select the preferred gene name and disease annotation of all human UniProtKB entries that are known to be involved in a disease
xxxxxxxxxx
# For entries with organism taxon:9606: the preferred gene label and the
# comment text of each disease annotation.
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?name ?text
WHERE {
    ?protein a up:Protein ;
             up:organism taxon:9606 ;
             up:encodedBy ?gene ;
             up:annotation ?annotation .
    ?gene skos:prefLabel ?name .
    ?annotation a up:Disease_Annotation ;
                rdfs:comment ?text .
}
9: Select all human UniProtKB entries with a sequence variant that leads to a 'loss of function'
xxxxxxxxxx
# Entries with organism taxon:9606 that carry a natural-variant annotation
# whose comment mentions 'loss of function'.
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?protein ?text
WHERE {
    ?protein a up:Protein ;
             up:organism taxon:9606 ;
             up:annotation ?annotation .
    ?annotation a up:Natural_Variant_Annotation ;
                rdfs:comment ?text .
    # Plain, case-sensitive substring match on the annotation comment.
    FILTER (CONTAINS(?text, 'loss of function'))
}
10: Select all human UniProtKB entries with a sequence variant that leads to a tyrosine to phenylalanine substitution
xxxxxxxxxx
# Natural variants in taxon:9606 entries where the residue at the variant's
# begin position is 'Y' and the recorded substitution is 'F'.
PREFIX faldo: <http://biohackathon.org/resource/faldo#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?protein ?annotation ?begin ?text
WHERE {
    ?protein a up:Protein .
    ?protein up:organism taxon:9606 .
    ?protein up:annotation ?annotation .

    ?annotation a up:Natural_Variant_Annotation .
    ?annotation rdfs:comment ?text .
    ?annotation up:substitution ?substitution .

    # The begin node of the range gives both the coordinate and the
    # sequence that the coordinate refers to.
    ?annotation up:range/faldo:begin ?beginNode .
    ?beginNode faldo:position ?begin .
    ?beginNode faldo:reference ?sequence .

    ?sequence rdf:value ?value .

    # SUBSTR is 1-based: take the single residue located at ?begin.
    BIND (SUBSTR(?value, ?begin, 1) AS ?original)
    FILTER (?original = 'Y' && ?substitution = 'F')
}
11: Select all UniProtKB entries with annotated transmembrane regions and the regions' begin and end coordinates on the canonical sequence
xxxxxxxxxx
# Entries with transmembrane annotations, plus the begin and end positions
# of each annotated range.
PREFIX faldo: <http://biohackathon.org/resource/faldo#>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?protein ?begin ?end
WHERE {
    ?protein a up:Protein ;
             up:annotation ?annotation .
    ?annotation a up:Transmembrane_Annotation ;
                up:range ?range .
    ?range faldo:begin/faldo:position ?begin ;
           faldo:end/faldo:position ?end .
}
12: Select all UniProtKB entries that were integrated on the 30th of November 2010
xxxxxxxxxx
# Entries whose up:created value is the xsd:date 2010-11-30.
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT ?protein
WHERE {
    ?protein a up:Protein ;
             up:created '2010-11-30'^^xsd:date .
}
13: Was any UniProtKB entry integrated on the 9th of January 2013?
xxxxxxxxxx
# Boolean check: true when at least one entry has up:created equal to the
# xsd:date 2013-01-09.
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

ASK
WHERE {
    ?protein a up:Protein ;
             up:created '2013-01-09'^^xsd:date .
}
14: Construct new triples of the type 'HumanProtein' from all human UniProtKB entries
xxxxxxxxxx
# Build one "?protein a up:HumanProtein" triple for every entry whose
# organism is taxon:9606.
PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
PREFIX up: <http://purl.uniprot.org/core/>

CONSTRUCT {
    ?protein a up:HumanProtein .
}
WHERE {
    ?protein a up:Protein ;
             up:organism taxon:9606 .
}
17: Select the average number of cross-references to the PDB database of UniProtKB entries that have at least one cross-reference to the PDB database
xxxxxxxxxx
# Average number of distinct PDB cross-references per entry, taken over the
# entries that have at least one such cross-reference.
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT (AVG(?linksToPdbPerEntry) AS ?avgLinksToPdbPerEntry)
WHERE {
    # Inner query: one row per entry with its count of distinct PDB links.
    # It is deliberately left unordered; the outer AVG does not depend on
    # row order, so sorting the per-entry counts would only add work.
    SELECT ?protein (COUNT(DISTINCT ?db) AS ?linksToPdbPerEntry)
    WHERE {
        ?protein a up:Protein ;
                 rdfs:seeAlso ?db .
        ?db up:database <http://purl.uniprot.org/database/PDB> .
    }
    GROUP BY ?protein
}
18: Select the number of UniProtKB entries for each of the EC (Enzyme Commission) top level categories
xxxxxxxxxx
# Number of distinct entries per top-level EC class.
PREFIX ec: <http://purl.uniprot.org/enzyme/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX up: <http://purl.uniprot.org/core/>

# DISTINCT is required: the pattern yields one row per
# (protein, enzyme, matching path), so an entry with several enzymes in the
# same class, or the same enzyme reachable both directly and through a
# domain/component, would otherwise be counted more than once.
SELECT ?ecClass (COUNT(DISTINCT ?protein) AS ?size)
WHERE {
    VALUES (?ecClass) {
        (ec:1.-.-.-) (ec:2.-.-.-) (ec:3.-.-.-) (ec:4.-.-.-)
        (ec:5.-.-.-) (ec:6.-.-.-) (ec:7.-.-.-)
    }
    # The enzyme may hang off the entry itself or off one of its
    # domains/components.
    ?protein ( up:enzyme | up:domain/up:enzyme | up:component/up:enzyme ) ?enzyme .
    # Enzyme subclasses are materialized, do not use rdfs:subClassOf+
    ?enzyme rdfs:subClassOf ?ecClass .
}
GROUP BY ?ecClass
ORDER BY ?ecClass
19: Find all natural variant annotations that are associated via an evidence tag with an article that has a PubMed identifier
xxxxxxxxxx
# Natural-variant annotations that are the rdf:object of a statement
# attributed to a journal citation; IRIs are trimmed to short identifiers.
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?accession ?annotation_acc ?pubmed
WHERE {
    ?protein a up:Protein .
    ?protein up:annotation ?annotation .
    ?annotation a up:Natural_Variant_Annotation .

    # Evidence link: a statement about the annotation carrying an attribution.
    ?linkToEvidence rdf:object ?annotation .
    ?linkToEvidence up:attribution ?attribution .
    ?attribution up:source ?source .
    ?source a up:Journal_Citation .

    # The fixed SUBSTR offsets presumably skip the namespace part of each
    # IRI (33 = length of "http://purl.uniprot.org/uniprot/" + 1) -- confirm
    # the annotation and citation namespaces against the data.
    BIND (SUBSTR(STR(?protein), 33) AS ?accession)
    BIND (
        IF(CONTAINS(STR(?annotation), "#SIP"),
           SUBSTR(STR(?annotation), 33),
           SUBSTR(STR(?annotation), 36))
        AS ?annotation_acc
    )
    BIND (SUBSTR(STR(?source), 35) AS ?pubmed)
}
20: Find how often an article in PubMed was used in an evidence tag in a human protein (ordered by most used to least)
xxxxxxxxxx
# For each journal citation, count the attributions that use it as the
# source for an annotation of a taxon:9606 entry; most-used first.
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
PREFIX up: <http://purl.uniprot.org/core/>

# NOTE(review): the output column is spelled "attribitions" (sic). It is kept
# as-is because consumers of the result set may rely on that column name.
SELECT ?source (COUNT(?attribution) AS ?attribitions)
WHERE {
    ?protein a up:Protein .
    ?protein up:organism taxon:9606 .
    ?protein up:annotation ?annotation .

    ?linkToEvidence rdf:object ?annotation .
    ?linkToEvidence up:attribution ?attribution .

    ?attribution up:source ?source .
    ?source a up:Journal_Citation .
}
GROUP BY ?source
ORDER BY DESC(?attribitions)
21: Find where disease related proteins are known to be located in the cell
xxxxxxxxxx
# Combine, per protein, every annotation that names a disease with every
# annotation that names a cellular component.
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?protein ?disease ?location_inside_cell ?cellcmpt
WHERE {
    # Two annotations on the same protein; nothing forces them to differ,
    # the paths below decide which role each one can play.
    ?protein up:annotation ?diseaseAnnotation .
    ?protein up:annotation ?subcellAnnotation .

    ?diseaseAnnotation up:disease/skos:prefLabel ?disease .
    ?subcellAnnotation up:locatedIn/up:cellularComponent ?cellcmpt .
    ?cellcmpt skos:prefLabel ?location_inside_cell .
}
22: For two accession numbers (ACs) find the GO term labels and group them into GO process, function and component
xxxxxxxxxx
# For the listed accessions, gather GO term labels into three
# ';'-separated lists, one per GO root (component, process, function).
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX GO: <http://purl.obolibrary.org/obo/GO_>

SELECT
    (SUBSTR(STR(?protein), 33) AS ?uniprot)
    (GROUP_CONCAT(?celtype; separator=";") AS ?celtypes)
    (GROUP_CONCAT(?biotype; separator=";") AS ?biotypes)
    (GROUP_CONCAT(?moltype; separator=";") AS ?moltypes)
WHERE {
    VALUES ?ac { "Q6GZX4" "Q96375" }
    # Turn each accession into the corresponding entry IRI.
    BIND (IRI(CONCAT("http://purl.uniprot.org/uniprot/", ?ac)) AS ?protein)

    ?protein a up:Protein ;
             up:classifiedWith ?goTerm .

    # Label bound only when the term sits under biological_process.
    OPTIONAL {
        ?goTerm rdfs:subClassOf GO:0008150 ;
                rdfs:label ?biotype .
    }
    # Label bound only when the term sits under cellular_component.
    OPTIONAL {
        ?goTerm rdfs:subClassOf GO:0005575 ;
                rdfs:label ?celtype .
    }
    # Label bound only when the term sits under molecular_function.
    OPTIONAL {
        ?goTerm rdfs:subClassOf GO:0003674 ;
                rdfs:label ?moltype .
    }

    # Drop classifications that matched none of the three roots
    # (for example UniProt keywords).
    FILTER (BOUND(?biotype) || BOUND(?celtype) || BOUND(?moltype))
}
GROUP BY ?protein
23: Number of reviewed human entries (UniProtKB/Swiss-Prot) that are related to kinase activity
xxxxxxxxxx
# Count distinct reviewed entries with organism taxon:9606 that are
# classified with GO:0016301 or with a direct subclass of it.
PREFIX GO: <http://purl.obolibrary.org/obo/GO_>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT (COUNT(DISTINCT ?protein) AS ?pc)
WHERE {
    ?protein rdf:type up:Protein .
    ?protein up:reviewed true .
    ?protein up:organism taxon:9606 .
    # Either classified with the term itself, or with a term one
    # rdfs:subClassOf step below it.
    ?protein up:classifiedWith|(up:classifiedWith/rdfs:subClassOf) GO:0016301 .
}
24: Find the release number of the UniProt data that is currently being queried
xxxxxxxxxx
# Read the pav:version value(s) published in the endpoint's VoID
# description graph.
PREFIX pav: <http://purl.org/pav/>

SELECT ?version
FROM <https://sparql.uniprot.org/.well-known/void>
WHERE {
    # Any subject will do; only the version value is of interest.
    [] pav:version ?version .
}
25: Find UniProtKB entry which has a protein name 'HLA class I histocompatibility antigen, B-73 alpha chain'
xxxxxxxxxx
# Entries whose recommended or alternative structured name carries the
# exact text 'HLA class I histocompatibility antigen, B-73 alpha chain'.
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?protein ?anyKindOfName
WHERE {
    ?protein a up:Protein .
    ?protein (up:recommendedName|up:alternativeName) ?structuredName .
    # The literal must match the name exactly; it now reads "B-73 alpha
    # chain", as in this query's title and in the sibling queries on the
    # same name (previously "B alpha chain").
    ?structuredName ?anyKindOfName "HLA class I histocompatibility antigen, B-73 alpha chain" .
    # Keep only predicates declared as a kind of structured name
    # (full name, short name, ...).
    ?anyKindOfName rdfs:subPropertyOf up:structuredNameType .
}
26: Find UniProtKB proteins which are cleaved and contain a domain or component named 'HLA class I histocompatibility antigen, B-73 alpha chain'
xxxxxxxxxx
# Entries carrying the name 'HLA class I histocompatibility antigen, B-73
# alpha chain' on themselves or on one of their domains/components.
# NOTE(review): the first path alternative also matches names attached to
# the entry itself, not only to a domain or component -- confirm that this
# is intended for a query described as being about cleaved proteins.
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?protein ?anyKindOfName
WHERE {
    ?protein a up:Protein ;
             (up:recommendedName|up:alternativeName)|((up:domain|up:component)/(up:recommendedName|up:alternativeName)) ?structuredName .
    ?structuredName ?anyKindOfName "HLA class I histocompatibility antigen, B-73 alpha chain" .
    # Restrict the matched predicate to the structured-name properties.
    ?anyKindOfName rdfs:subPropertyOf up:structuredNameType .
}
26: Find a UniProtKB entry, or a UniProtKB entry's domain or component, which has the name 'HLA class I histocompatibility antigen, B-73 alpha chain'
xxxxxxxxxx
# Match the name 'HLA class I histocompatibility antigen, B-73 alpha chain'
# wherever it occurs: on the entry, or on a domain or component of it.
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?protein ?anyKindOfName
WHERE {
    ?protein a up:Protein .
    # Path alternatives: a name directly on the entry, or a name reached
    # through one domain/component hop.
    ?protein
        (up:recommendedName|up:alternativeName)|((up:domain|up:component)/(up:recommendedName|up:alternativeName))
        ?structuredName .
    ?structuredName ?anyKindOfName "HLA class I histocompatibility antigen, B-73 alpha chain" .
    ?anyKindOfName rdfs:subPropertyOf up:structuredNameType .
}
27: Retrieve all protein names, including peptide names, associated with UniProtKB entry P05067.
xxxxxxxxxx
# Every structured-name value of entry P05067, taken from the entry itself
# and from its domains and components.
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX up: <http://purl.uniprot.org/core/>

SELECT ?protein ?anyKindOfName ?names ?partType
WHERE {
    BIND (<http://purl.uniprot.org/uniprot/P05067> AS ?protein)
    ?protein a up:Protein .

    {
        # Names attached directly to the entry; ?partType stays unbound
        # for these rows.
        ?protein (up:recommendedName|up:alternativeName) ?structuredName .
    }
    UNION
    {
        # Names attached to a domain or component; ?partType records which
        # of the two predicates led to the part.
        VALUES ?partType { up:domain up:component }
        ?protein ?partType ?part .
        ?part (up:recommendedName|up:alternativeName) ?structuredName .
    }

    ?structuredName ?anyKindOfName ?names .
    # Keep only predicates declared as a kind of structured name.
    ?anyKindOfName rdfs:subPropertyOf up:structuredNameType .
}