Rで解析：Entrezの操作が楽々です！「rentrez」パッケージ

Entrezは、論文データベースのPubMedでおなじみのNational Center for Biotechnology Information (NCBI)が開発した検索システムです。あまりにも有名なのでEntrezの説明は省きますが、論文だけでなくSNPを含む遺伝子情報のデータベースもあり、J-STAGEやJDreamⅢとともに重宝しています。

Entrezもそうですが、多くのウェブ上のデータベースはソフトウェアから操作するためのAPIを公開しています。APIを利用してデータを取得すると作業効率が格段に上昇します。

本パッケージはEntrezのAPIを操作してデータを取得するパッケージです。一瞬で、必要なデータを取得できます。

ただし、大量のデータを取得する行為は迷惑となりますので十分に注意してください。

パッケージのバージョンは1.2.3。実行コマンドはR version 4.2.2で確認しています。

パッケージのインストール

下記コマンドを実行してください。

# Install the rentrez package
install.packages("rentrez")

実行コマンド

詳細はコメント、パッケージヘルプを確認してください。

# Load the package
library("rentrez")

# List the available databases: entrez_dbs()
entrez_dbs()
[1] "pubmed"          "protein"         "nuccore"         "nucleotide"      "nucgss"
[6] "nucest"          "structure"       "genome"          "gpipe"           "annotinfo"
[11] "assembly"        "bioproject"      "biosample"       "blastdbinfo"     "books"
[16] "cdd"             "clinvar"         "clone"           "gap"             "gapplus"
[21] "grasp"           "dbvar"           "epigenomics"     "gene"            "gds"
[26] "geoprofiles"     "homologene"      "medgen"          "mesh"            "ncbisearch"
[31] "nlmcatalog"      "omim"            "orgtrack"        "pmc"             "popset"
[36] "probe"           "proteinclusters" "pcassay"         "biosystems"      "pccompound"
[41] "pcsubstance"     "pubmedhealth"    "seqannot"        "snp"             "sra"
[46] "taxonomy"        "unigene"         "gencoll"         "gtr" 

# Show a summary of one database: entrez_db_summary()
entrez_db_summary("gene")

DbName: gene
MenuName: Gene
Description: Gene database
DbBuild: Build230112-2235m.1
Count: 63762982
LastUpdate: 2023/01/13 17:52 

# Show the fields that can be used when searching a database:
# entrez_db_searchable()
entrez_db_searchable("gene")

Searchable fields for database 'gene'
ALL 	 All terms from all searchable fields 
UID 	 Unique number assigned to a gene record 
FILT 	 Limits the records 
TITL 	 gene or protein name 
WORD 	 Free text associated with record 
ORGN 	 scientific and common names of organism 
MDAT 	 The last date on which the record was updated 
CHR 	 Chromosome number or numbers; also 'mitochondrial', 'unknown' properties 
MV 	 Chromosomal map location as displayed in MapViewer 
GENE 	 Symbol or symbols of the gene 
ECNO 	 EC number for enzyme or CAS registry number 
MIM 	 MIM number from OMIM 
DIS 	 Name(s) of diseases associated with this gene. When available, OMIM name will be used 
ACCN 	 Nucleotide or protein accession(s) associated with this gene 
UGEN 	 UniGene cluster number for this gene 
PROP 	 Properties of Gene record 
CDAT 	 The date on which this record first appeared 
NCAC 	 nucleotide accessions of sequences 
NUID 	 nucleotide uids of sequences 
PACC 	 protein accessions 
PUID 	 protein uids 
PMID 	 PubMed ids of accessions linked to the record 
TID 	 taxonomy id 
GO 	 Gene Ontology 
DOM 	 Domain Name 
DDAT 	 The date on which the record was discontinued 
CPOS 	 Chromosome base position 
GFN 	 Gene full name 
PFN 	 Protein full name 
GL 	 Gene length 
XC 	 Exon count 
GRP 	 Relationships for this gene 
PREF 	 Preferred symbol of the gene 
AACC 	 Assembly accession 
ASM 	 Assembly name 
EXPR 	 Gene expression 

# Search by keyword: entrez_search()
# term:   Entrez query syntax can be used as-is
# db:     the database to search
# retmax: the number of results (IDs) to retrieve; NULL is passed here and
#         the result printed below holds 20 IDs
r_search <- entrez_search(db = "gene", term = "MTHFR AND Homo sapiens[ORGN]", retmax = NULL)
# Print the search result
r_search

Entrez search result with 196 hits (object contains 20 IDs and no web_history object)
Search term (as translated):  MTHFR[All Fields] AND "Homo sapiens"[Organism] 

# Inspect the structure of the search result
str(r_search)
List of 5
$ ids             : chr [1:20] "7157" "348" "7124" "7422" ...
$ count           : int 196
$ retmax          : int 20
$ QueryTranslation: chr "MTHFR[All Fields] AND \"Homo sapiens\"[Organism]"
$ file            :Classes 'XMLInternalDocument', 'XMLAbstractDocument' <externalptr> 
  - attr(*, "class")= chr [1:2] "esearch" "list"

# Fetch summaries for the search hits: entrez_summary()
# id: the ID vector stored in the search result list (r_search$ids)
GeneResult <- entrez_summary(db = "gene", id = r_search$ids)

# Print the result (this is the command that produces the output shown next)
GeneResult
$`7157`
esummary result with 20 items:
[1] uid                name               description        status             currentid         
[6] chromosome         geneticsource      maplocation        otheraliases       otherdesignations 
[11] nomenclaturesymbol nomenclaturename   nomenclaturestatus mim                genomicinfo       
[16] geneweight         summary            chrsort            chrstart           organism    

# (remaining output omitted)
# Example: show the description of the first record
sapply(GeneResult, "[[", "description")[1]
               7157 
"tumor protein p53" 

### From the official rentrez documentation ###
# Count the papers published per year for a keyword (command partly modified).
# search_year(): returns the PubMed hit count for `term` restricted to the
# publication date `year` via the [PDAT] field tag.
#   year: the year to restrict the search to
#   term: an Entrez search term
search_year <- function(year, term) {
  pubmed_query <- paste(term, "AND (", year, "[PDAT])")
  # retmax = 0: only the hit count is used, so no IDs are requested
  hits <- entrez_search(db = "pubmed", term = pubmed_query, retmax = 0)
  hits$count
}
# Year range to query
year <- 2008:2022
# One PubMed hit count per year. vapply() pins the result to a numeric
# vector, so an unexpected return value fails here rather than inside plot().
papers <- vapply(year, search_year, numeric(1),
                 term = "MTHFR AND Homo sapiens[ORGN]",
                 USE.NAMES = FALSE)
# The counts are already numeric, so max() is used directly: the previous
# type.convert() call changed nothing and warns about a missing `as.is`
# argument on R >= 4.0. The y-axis gets 20% headroom above the largest count.
plot(year, papers, type = "b", main = "MTHFR AND Homo sapiens[ORGN]:PAPERS",
     ylim = c(0, max(papers) * 1.2))

# Search PubMed for a keyword and collect information about the hits
SearchResult <- entrez_search(db = "pubmed", retmax = 10,
                              term = "folic acid AND MTHFR")
# Fetch the summary records for the returned IDs
GetSummary <- entrez_summary(db="pubmed", id=SearchResult$ids)
# Show the title of the first record
sapply(GetSummary, "[[", "title")[1]
36637428 
#"Methylenetetrahydrofolate reductase deficiency and high dose FA
#supplementation disrupt embryonic development of energy balance and metabolic
#homeostasis in zebrafish." 

# Write the titles to TEST.html in the working directory as an HTML table
# (no styling is applied).
# entrez_summary() returns a list, so ordinary list operations such as
# sapply() can be used to pull fields out of it.
GetSummary |>
  sapply("[[", "title") |>
  knitr::kable(row.names = TRUE, caption = "TEST table",
               col.names = c("TiTle"), format = "html") |>
  cat(file = "TEST.html", sep = "\n")

出力例

マークダウンしてHTMLテーブルで出力。体裁は整えていません。画像で紹介します。

少しでも、あなたのウェブや実験の解析が楽になりますように！！