Reading an XML file in R involves handling several components: the root and child nodes, their attributes, and their values.
Packages required for reading xml data
library(XML) # required for reading xml data.
library(xml2)
User-defined Functions (UDFs)
Below are several UDFs for reading XML files in R.
Function to parse simple non-nested xml data without attributes
#' Parse simple, non-nested XML records into a data frame
#'
#' @param xmlFile Input handed to `xml2::read_xml()` (e.g. the path to an
#'   XML file).
#' @param xmlTag XPath expression selecting the record nodes; the matched
#'   nodes are what gets converted to the data frame.
#' @return The data frame produced by `XML::xmlToDataFrame()` for the nodes
#'   matched by `xmlTag`.
parseXMLSimpleTextDataasDF <- function(xmlFile, xmlTag) {
  # read the input with xml2
  xml2_doc <- xml2::read_xml(xmlFile)
  # XML::xmlParse() is documented for file names or XML text, not for xml2
  # objects, so hand the document over explicitly as serialised text instead
  # of relying on implicit coercion
  xml_doc <- XML::xmlParse(as.character(xml2_doc), asText = TRUE)
  # convert the nodes matched by xmlTag to a data frame
  XML::xmlToDataFrame(nodes = XML::getNodeSet(xml_doc, xmlTag))
}
Function to extract both attributes and text from simple XML data using XPath.
#' Extract the "id" attribute and <text> child of each matched XML node
#'
#' @param xmlFile Input handed to `xml2::read_xml()` (e.g. the path to an
#'   XML file).
#' @param xmlTag XPath expression selecting the nodes of interest.
#' @return A base data frame with one row per matched node and two character
#'   columns: `doc_id` (value of the node's "id" attribute, `NA` when the
#'   attribute is absent) and `text` (text of the node's first `<text>`
#'   child, `NA` when there is none).
extractXMLTextwithAttributes <- function(xmlFile, xmlTag) {
  # read xml file
  xml <- xml2::read_xml(xmlFile)
  # find all nodes that match xmlTag
  doc <- xml2::xml_find_all(xml, xpath = xmlTag)
  # retrieve the value of a single attribute, in our case the "id" attribute
  doc_id <- xml2::xml_attr(doc, "id")
  # Look up the <text> child relative to each matched node: xml_find_first()
  # returns exactly one result (possibly missing) per input node, so `text`
  # always lines up with `doc_id`, even when a node has no <text> child or
  # has several.
  text <- xml2::xml_text(xml2::xml_find_first(doc, "./text"))
  # base data.frame() keeps the function free of tibble/magrittr, which the
  # packages loaded for this file do not provide
  data.frame(doc_id = doc_id, text = text, stringsAsFactors = FALSE)
}
Function to read nested (tree-like) XML and extract attributes from each node and from its ancestor nodes.
#' Extract <doc> annotations together with attributes of their ancestors
#'
#' @param xmlFile Input handed to `xml2::read_xml()` (e.g. the path to an
#'   XML file).
#' @return A data frame with one row per `<doc>` node and character columns
#'   `Source` ("source" attribute of a `<diseases>` ancestor), `diseaseName`
#'   ("name" attribute of a `<disease>` ancestor), `doc_id` ("id" attribute
#'   of the node) and `judgment` ("judgment" attribute of the node).
#'   Attributes or ancestors that are absent yield `NA`.
extractAnnotationsXML <- function(xmlFile) {
  # read xml file
  xml <- xml2::read_xml(xmlFile)
  # find all <doc> nodes
  edge_nodes <- xml2::xml_find_all(xml, ".//doc")
  # xml_find_first() returns one result per <doc> node, so the ancestor
  # lookups stay aligned with edge_nodes
  diseases_nodes <- xml2::xml_find_first(edge_nodes, ".//ancestor::diseases")
  disease_nodes <- xml2::xml_find_first(edge_nodes, ".//ancestor::disease")
  # build the data.frame; plain calls instead of %>% keep the function free
  # of magrittr, and stringsAsFactors = FALSE gives character columns on
  # every R version
  data.frame(
    Source = xml2::xml_attr(diseases_nodes, "source"),
    diseaseName = xml2::xml_attr(disease_nodes, "name"),
    doc_id = xml2::xml_attr(edge_nodes, "id"),
    judgment = xml2::xml_attr(edge_nodes, "judgment"),
    stringsAsFactors = FALSE
  )
}