-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathexamples.R
159 lines (123 loc) · 4.79 KB
/
examples.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
## devtools::install_github('ultinomics/xmltools')
## library(xmltools)
library(xmltools)
empty_as_na <- function(x){
if("factor" %in% class(x)) x <- as.character(x) ## since ifelse wont work with factors
if(class(x) == "character") ifelse(as.character(x)!="", x, NA) else x
}
# USING ebay.xml ------------------------------------------------
# load the data
file <- system.file("extdata", "ebay.xml", package = "xmlExtras")
doc <- file %>%
xml2::read_xml()
nodeset <- doc %>%
xml2::xml_children() # get top level nodeset
# `xml_view_tree` structure
## we can get a tree for each node of the doc
doc %>%
xml_view_tree()
doc %>% # we can also vary the depth
xml_view_tree(depth = 2)
## easier to read and understand than `xml2::xml_structure()` and has the `depth` option
nodeset[1] %>% xml2::xml_structure()
## or, we can extract from nodesets
class(nodeset[1])
nodeset[1] %>%
xml_view_trees()
nodeset[1] %>%
xml_view_trees(depth=2)
## will not work with class "xml_node" (can't use lapply on those, apparently)
class(nodeset[[1]])
try(nodeset[[1]] %>%
xml_view_tree()
)
## one can see all the paths per node of a doc
doc %>%
xml_get_paths()
## can look at one nodeset
## NOTE that nodesets can vary, so looking at one doesn't mean you'll find all feasible paths
nodeset[1] %>%
xml_get_paths()
nodeset[1] %>%
xml_get_paths(mark_terminal = ">>") # can mark terminal nodes
## we can find all feasible paths then collapse
terminal <- doc %>% ## get all xpaths
xml_get_paths()
xpaths <- terminal %>% ## collapse xpaths to unique only
unlist() %>%
unique()
## but what we really want is the parent node of terminal nodes.
## use the `only_terminal_parent = TRUE` to do this
terminal_parent <- doc %>% ## get all xpaths to parents of parent node
xml_get_paths(only_terminal_parent = TRUE)
terminal_xpaths <- terminal_parent %>% ## collapse xpaths to unique only
unlist() %>%
unique()
## xmlToDataFrame works great on terminal nodes IF there are no non-terminal nodes any deeper.
## we extract a data frame for each parent of terminal nodes
df0 <- lapply(terminal_xpaths, function(x) {
doc <- file %>% XML::xmlInternalTreeParse()
nodeset <- XML::getNodeSet(doc, x)
XML::xmlToDataFrame(nodeset, stringsAsFactors = FALSE) %>%
dplyr::as_data_frame()
})
## problem with xmlToDataFrame is it keeps digging into other nodes recursively in "/root/listing"
xpaths[1] # /root/listing is terminal parent but xmlToDataFrame keeps digging
df0[[1]] %>%
dplyr::select(seller_info) # not good; keeps diving into other nodes but fails to separate
xpaths[2]
df0[[2]] # works because the recursive dig down hits only the terminal nodes
# xml_to_df (XML package based)
## does not dig by default
## use the terminal xpaths to get data frames
terminal_xpaths
## we send each terminal xpath to `xml_to_df`.
## the file source is the parsed xml object `doc`, so we set `is_xml = TRUE`
## we do no want to dig, which quickly gets us the data we want for each terminal xpath `dig = FALSE` (default)
df1 <- lapply(terminal_xpaths, xml_to_df, file = doc, is_xml = TRUE, dig = FALSE) %>%
dplyr::bind_cols()
# xml_dig_df (xml2 package based)
terminal_nodesets <- lapply(terminal_xpaths, xml2::xml_find_all, x = doc)
df2 <- terminal_nodesets %>%
purrr::map(xml_dig_df) %>% ## does not dig by default
purrr::map(dplyr::bind_rows) %>%
dplyr::bind_cols() %>%
dplyr::mutate_all(empty_as_na)
## they're the same!
identical(df1, data.table::as.data.table(df2))
# USING wsu.xml ------------------------------------------------
# larger file
# using xml_to_df
file <- "http://aiweb.cs.washington.edu/research/projects/xmltk/xmldata/data/courses/wsu.xml"
doc <- file %>%
xml2::read_xml()
nodeset <- doc %>%
xml2::xml_children()
length(nodeset) # lots of nodes!
nodeset[1] %>% # lets look at ONE node's tree
xml_view_tree()
## takes a long time. likely can extract from a single node
# terminal_paths <- doc %>% ## get the xpath to parents of terminal node
# xml_get_paths(only_terminal_parent = TRUE)
# lets assume that most nodes share the same structure
terminal_paths <- nodeset[1] %>%
xml_get_paths(only_terminal_parent = TRUE)
terminal_xpaths <- terminal_paths %>% ## collapse xpaths to unique only
unlist() %>%
unique()
# xml_to_df (XML package based)
## note that we use file, not doc, hence is_xml = FALSE
df1 <- lapply(terminal_xpaths, xml_to_df, file = file, is_xml = FALSE, dig = FALSE) %>%
dplyr::bind_cols()
df1
# xml_dig_df (xml2 package based)
## faster!
terminal_nodesets <- lapply(terminal_xpaths, xml2::xml_find_all, x = doc) # use xml docs, not nodesets! I think this is because it searches the 'root'.
df2 <- terminal_nodesets %>%
purrr::map(xml_dig_df) %>%
purrr::map(dplyr::bind_rows) %>%
dplyr::bind_cols() %>%
dplyr::mutate_all(empty_as_na)
df2
# they're the same!
identical(df1, data.table::as.data.table(df2))