我正在尝试在单个目录中读取多个XML文件,提取数据并清理一列,然后与现有数据框进行查找/合并。
我在这里还看到了其他问题,它们既可以读取多个文件又可以清除列数据,但仍然无法正常工作。
我要:
我想从XML中提取以下字段(数据来自Project Gutenberg的RDF文件,但转换为XML后似乎更容易处理):
因此,提取的XML的数据框架应如下所示:
xmlframe
Title Filenumber Downloads
Complete works of Shakespeare 100 3369
Robinson Crusoe 12623 2189
Prisoner of Zenda 95 25
然后我想将其与原始数据框合并:
original_data
Title Filenumber Downloads Status
Complete works of Shakespeare 100 4790 SUCCESS
Robinson Crusoe 12623 1978 SUCCESS
Prisoner of Zenda 95 50 SUCCESS
产生:
merged
Title Filenumber Downloads Status Downloads_2018 Status_2018
Complete works of Shakespeare 100 4790 SUCCESS 3369 SUCCESS
Robinson Crusoe 12623 1978 SUCCESS 2189 SUCCESS
Prisoner of Zenda 95 50 SUCCESS 25 FAILURE
我参考此处的其他答案,先尝试只处理单个XML文件,但仍然无法正常工作:在清理电子书(ebook)列时,结果只显示1:
# Attempt at extracting title, ebook number and download count from a single
# Project Gutenberg XML file into one data frame.
library(xml2)
pg <- read_xml("/Users/username/example/100/pg100.xml")
# Get the title: text of every dcterms:title node, whitespace-trimmed.
recs <- xml_find_all(pg, "//dcterms:title")
vals <- trimws(xml_text(recs))
xmlframe <- data.frame(vals)
# Get the file number from the rdf:about attribute of pgterms:ebook.
# The value still carries its "ebooks/" prefix and needs cleaning.
recs2 <- xml_find_all(pg, "//pgterms:ebook/@rdf:about")
vals2 <- trimws(xml_text(recs2))
# NOTE(review): this stores a whole data frame inside one column rather than
# the plain vector `vals2`.
xmlframe$filenumber <- data.frame(vals2)
# Get the total downloads: text of every pgterms:downloads node.
recs3 <- xml_find_all(pg, "//pgterms:downloads")
vals3 <- trimws(xml_text(recs3))
# NOTE(review): same pattern as above - a data frame is assigned as a column.
xmlframe$downloads <- data.frame(vals3)
# Rebuild the data frame from the nested one; the printed header that follows
# shows the resulting column names as vals, vals2 and vals3.
xmlframe <- data.frame(xmlframe)
head(xmlframe)
vals vals2 vals3
1 The Complete Works of William Shakespeare 3356 ebooks/100
编辑示例XML:
<?xml version="1.0" encoding="utf-8"?>
<rdf:RDF xml:base="http://www.gutenberg.org/"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:cc="http://web.resource.org/cc/"
xmlns:dcterms="http://purl.org/dc/terms/"
xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
xmlns:pgterms="http://www.gutenberg.org/2009/pgterms/"
xmlns:dcam="http://purl.org/dc/dcam/"
>
<cc:Work rdf:about="">
<cc:license rdf:resource="https://creativecommons.org/publicdomain/zero/1.0/"/>
<rdfs:comment>Archives containing the RDF files for *all* our books can be downloaded at
http://www.gutenberg.org/wiki/Gutenberg:Feeds#The_Complete_Project_Gutenberg_Catalog </rdfs:comment>
</cc:Work>
<pgterms:ebook rdf:about="ebooks/100">
<dcterms:hasFormat>
<pgterms:file rdf:about="http://www.gutenberg.org/ebooks/100.epub.images">
<dcterms:isFormatOf rdf:resource="ebooks/100"/>
<dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">2458527 </dcterms:extent>
<dcterms:format>
<rdf:Description rdf:nodeID="N010770d4a6d74aa5b55c39eb855d655c">
<dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
<rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">application/epub+zip </rdf:value>
</rdf:Description>
</dcterms:format>
<dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2018-06-14T14:40:58.596508 </dcterms:modified>
</pgterms:file>
</dcterms:hasFormat>
<dcterms:hasFormat>
<pgterms:file rdf:about="http://www.gutenberg.org/files/100/100-h.zip">
<dcterms:isFormatOf rdf:resource="ebooks/100"/>
<dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2018-06-14T11:18:00 </dcterms:modified>
<dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">2279312 </dcterms:extent>
<dcterms:format>
<rdf:Description rdf:nodeID="N89bc524c08b54a6594240ccf8818fb58">
<rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">text/html; charset=utf-8 </rdf:value>
<dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
</rdf:Description>
</dcterms:format>
<dcterms:format>
<rdf:Description rdf:nodeID="Ncf9c66f72ad34bbd847dae7f9508e087">
<rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">application/zip </rdf:value>
<dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
</rdf:Description>
</dcterms:format>
</pgterms:file>
</dcterms:hasFormat>
<dcterms:rights>Copyrighted. Read the copyright notice inside this book for details. </dcterms:rights>
<dcterms:hasFormat>
<pgterms:file rdf:about="http://www.gutenberg.org/files/100/100-0.txt">
<dcterms:isFormatOf rdf:resource="ebooks/100"/>
<dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2018-06-14T11:16:50 </dcterms:modified>
<dcterms:format>
<rdf:Description rdf:nodeID="Nefb81f5f85714c5c964f3589029cb59f">
<dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
<rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">text/plain; charset=utf-8 </rdf:value>
</rdf:Description>
</dcterms:format>
<dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">5852404 </dcterms:extent>
</pgterms:file>
</dcterms:hasFormat>
<dcterms:hasFormat>
<pgterms:file rdf:about="http://www.gutenberg.org/files/100/100-h/100-h.htm">
<dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">7033656 </dcterms:extent>
<dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2018-06-14T11:17:18 </dcterms:modified>
<dcterms:format>
<rdf:Description rdf:nodeID="N615edc4ab2e74da8a6962978ead1dc6a">
<rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">text/html; charset=utf-8 </rdf:value>
<dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
</rdf:Description>
</dcterms:format>
<dcterms:isFormatOf rdf:resource="ebooks/100"/>
</pgterms:file>
</dcterms:hasFormat>
<dcterms:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#date">1994-01-01 </dcterms:issued>
<dcterms:title>The Complete Works of William Shakespeare </dcterms:title>
<dcterms:publisher>Project Gutenberg </dcterms:publisher>
<dcterms:hasFormat>
<pgterms:file rdf:about="http://www.gutenberg.org/ebooks/100.epub.noimages">
<dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">2458527 </dcterms:extent>
<dcterms:isFormatOf rdf:resource="ebooks/100"/>
<dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2018-06-14T14:41:02.065471 </dcterms:modified>
<dcterms:format>
<rdf:Description rdf:nodeID="N0a45571345a449c89c177f40d94e0c15">
<dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
<rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">application/epub+zip </rdf:value>
</rdf:Description>
</dcterms:format>
</pgterms:file>
</dcterms:hasFormat>
<dcterms:hasFormat>
<pgterms:file rdf:about="http://www.gutenberg.org/files/100/100-0.zip">
<dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2018-06-14T11:18:00 </dcterms:modified>
<dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">2166912 </dcterms:extent>
<dcterms:format>
<rdf:Description rdf:nodeID="N024b7e4e178442eebbe8e86aa362a6f1">
<rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">text/plain; charset=utf-8 </rdf:value>
<dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
</rdf:Description>
</dcterms:format>
<dcterms:isFormatOf rdf:resource="ebooks/100"/>
<dcterms:format>
<rdf:Description rdf:nodeID="Nf9d7bb6beb29471795e9096c3716acfe">
<dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
<rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">application/zip </rdf:value>
</rdf:Description>
</dcterms:format>
</pgterms:file>
</dcterms:hasFormat>
<dcterms:language>
<rdf:Description rdf:nodeID="N869dca334d39425c983a629d1a8a3538">
<rdf:value rdf:datatype="http://purl.org/dc/terms/RFC4646">en </rdf:value>
</rdf:Description>
</dcterms:language>
<dcterms:subject>
<rdf:Description rdf:nodeID="N0fad3feb8fdb4f58824d041fccd1b083">
<dcam:memberOf rdf:resource="http://purl.org/dc/terms/LCSH"/>
<rdf:value>English drama -- Early modern and Elizabethan, 1500-1600 </rdf:value>
</rdf:Description>
</dcterms:subject>
<dcterms:creator>
<pgterms:agent rdf:about="2009/agents/65">
<pgterms:webpage rdf:resource="http://en.wikipedia.org/wiki/William_Shakespeare"/>
<pgterms:deathdate rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">1616 </pgterms:deathdate>
<pgterms:alias>Shakspere, William </pgterms:alias>
<pgterms:birthdate rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">1564 </pgterms:birthdate>
<pgterms:name>Shakespeare, William </pgterms:name>
<pgterms:alias>Shakspeare, William </pgterms:alias>
</pgterms:agent>
</dcterms:creator>
<dcterms:hasFormat>
<pgterms:file rdf:about="http://www.gutenberg.org/ebooks/100.kindle.noimages">
<dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">10189331 </dcterms:extent>
<dcterms:format>
<rdf:Description rdf:nodeID="N3ab0025af98e43c980950490d2aa3c12">
<dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
<rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">application/x-mobipocket-ebook </rdf:value>
</rdf:Description>
</dcterms:format>
<dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2018-06-14T14:41:57.941535 </dcterms:modified>
<dcterms:isFormatOf rdf:resource="ebooks/100"/>
</pgterms:file>
</dcterms:hasFormat>
<dcterms:hasFormat>
<pgterms:file rdf:about="http://www.gutenberg.org/ebooks/100.kindle.images">
<dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">10189339 </dcterms:extent>
<dcterms:format>
<rdf:Description rdf:nodeID="Nafdd31a0c146463bb289f2b2d6176a92">
<dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
<rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">application/x-mobipocket-ebook </rdf:value>
</rdf:Description>
</dcterms:format>
<dcterms:isFormatOf rdf:resource="ebooks/100"/>
<dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2018-06-14T14:41:29.544991 </dcterms:modified>
</pgterms:file>
</dcterms:hasFormat>
<dcterms:subject>
<rdf:Description rdf:nodeID="Nf1c1a4b50f9d45d7821119dc1ef4c503">
<dcam:memberOf rdf:resource="http://purl.org/dc/terms/LCC"/>
<rdf:value>PR </rdf:value>
</rdf:Description>
</dcterms:subject>
<pgterms:downloads rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">3356 </pgterms:downloads>
<dcterms:type>
<rdf:Description rdf:nodeID="Nff80c94b0b52431a817a58ead14dfc2b">
<dcam:memberOf rdf:resource="http://purl.org/dc/terms/DCMIType"/>
<rdf:value>Text </rdf:value>
</rdf:Description>
</dcterms:type>
<pgterms:bookshelf>
<rdf:Description rdf:nodeID="N2d8aac020c354487bbbbc07d6aec32d1">
<dcam:memberOf rdf:resource="2009/pgterms/Bookshelf"/>
<rdf:value>Plays </rdf:value>
</rdf:Description>
</pgterms:bookshelf>
<dcterms:license rdf:resource="license"/>
<dcterms:hasFormat>
<pgterms:file rdf:about="http://www.gutenberg.org/ebooks/100.rdf">
<dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2018-07-12T05:00:06.369359 </dcterms:modified>
<dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">10868 </dcterms:extent>
<dcterms:isFormatOf rdf:resource="ebooks/100"/>
<dcterms:format>
<rdf:Description rdf:nodeID="Ndf4a9f6482974e3b9b818ce0b34c988d">
<rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">application/rdf+xml </rdf:value>
<dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
</rdf:Description>
</dcterms:format>
</pgterms:file>
</dcterms:hasFormat>
</pgterms:ebook>
<rdf:Description rdf:about="http://en.wikipedia.org/wiki/William_Shakespeare">
<dcterms:description>Wikipedia </dcterms:description>
</rdf:Description>
</rdf:RDF>
答案 0 :(得分:1)
只需使用 `gsub` 删除不需要的文本,并在解析出所有需要的值之后再调用 `data.frame`:
# Parse one Project Gutenberg catalogue record.
pg <- read_xml("/Users/username/example/100/pg100.xml")

# Output column name -> XPath query that supplies its values.
field_xpaths <- c(
  Title = "//dcterms:title",
  Filenumber_2018 = "//pgterms:ebook/@rdf:about",
  Downloads_2018 = "//pgterms:downloads"
)

# Run each query and keep the whitespace-trimmed text of the matches.
fields <- lapply(field_xpaths, function(xpath) {
  trimws(xml_text(xml_find_all(pg, xpath)))
})

# The rdf:about value reads "ebooks/<number>"; keep only the number part.
fields[["Filenumber_2018"]] <- gsub("ebooks/", "", fields[["Filenumber_2018"]])

# Assemble the data frame only after every field has been extracted.
xmlframe <- data.frame(fields)
xmlframe
# Title Filenumber_2018 Downloads_2018
# 1 The Complete Works of William Shakespeare 100 3356
要处理多个文件,请将以上步骤封装成一个函数,然后用 `lapply` 对所有XML文件调用该函数。
#' Extract title, ebook number and download count from one XML file.
#'
#' @param xml_path Path of the XML file, passed unchanged to `read_xml()`.
#' @return A data frame with the character columns `Title`,
#'   `Filenumber_2018` and `Downloads_2018`, built from the text matched by
#'   the three XPath queries below. An error is raised if the file cannot be
#'   read or the queries return vectors of incompatible lengths.
process_xml <- function(xml_path) {
  pg <- read_xml(xml_path)

  # Whitespace-trimmed text of every node/attribute matched by `xpath`.
  extract_text <- function(xpath) {
    trimws(xml_text(xml_find_all(pg, xpath)))
  }

  # get title
  titles <- extract_text("//dcterms:title")

  # get file number; the attribute reads "ebooks/<number>", so strip only
  # that leading prefix instead of every occurrence of the text
  file_numbers <- sub("^ebooks/", "", extract_text("//pgterms:ebook/@rdf:about"))

  # get total downloads
  downloads <- extract_text("//pgterms:downloads")

  # Last expression is the data frame itself (not an assignment) so the
  # result is returned visibly. stringsAsFactors = FALSE keeps the columns
  # character on R versions before 4.0 as well.
  data.frame(
    Title = titles,
    Filenumber_2018 = file_numbers,
    Downloads_2018 = downloads,
    stringsAsFactors = FALSE
  )
}
# GET XML FILE PATH NAMES (RECURSIVE FOR SUBDIRECTORIES)
# `pattern` is a regular expression: escape the dot and anchor the match at
# the end of the name so only files ending in ".xml" are picked up.
xml_files <- list.files(
  path = "/Users/username/example", pattern = "\\.xml$",
  full.names = TRUE, recursive = TRUE
)

# LIST OF DATAFRAMES (A FILE THAT FAILS TO PARSE IS REPORTED AND YIELDS NULL)
df_list <- lapply(xml_files, function(x) {
  tryCatch(
    process_xml(x),
    error = function(e) {
      # Keep going, but say which file was skipped and why.
      warning("Skipping ", x, ": ", conditionMessage(e), call. = FALSE)
      NULL
    }
  )
})

# REMOVE NULL ELEMENTS (I.E., ERRORS ABOVE) AND ZERO-ROW RESULTS
df_list <- Filter(function(d) NROW(d) > 0, df_list)

# Fail with a clear message instead of an obscure merge error further down.
if (length(df_list) == 0L) {
  stop("No XML file could be parsed into a data frame.", call. = FALSE)
}

# APPEND ALL DATAFRAMES
master_xml_df <- do.call(rbind, df_list)

# MERGE WITH ORIGINAL DATA
# Join on the ebook number rather than the title: the title text taken from
# the XML need not be spelled the same way as the title in original_data.
# The XML title is kept as Title_2018. Add all.x = TRUE to also keep rows of
# original_data that have no matching XML record.
final_df <- merge(
  original_data, master_xml_df,
  by.x = "Filenumber", by.y = "Filenumber_2018",
  suffixes = c("", "_2018")
)