vignettes/how-to-scrape.Rmd
how-to-scrape.Rmd
library(rvest)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(xml2)
# Download and parse the talk page once; the later chunks all query this
# parsed document instead of hitting the site again.
rv_doc <- rvest::read_html("https://www.churchofjesuschrist.org/study/liahona/2020/11/15cook?lang=eng")

# Print the tag / class / id skeleton of the article body so we can see
# which selectors are available before extracting anything.
rv_doc %>%
  html_elements(".body-block") %>%
  xml2::html_structure()
#> [[1]]
#> <div.body-block>
#> <p#p5 [data-aid]>
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> {text}
#> <p#p6 [data-aid]>
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> <p#p7 [data-aid]>
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> <p#p8 [data-aid]>
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> <p#p9 [data-aid]>
#> {text}
#> <p#p42 [data-aid]>
#> {text}
#> <p#p10 [data-aid]>
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> <p#p43 [data-aid]>
#> {text}
#> <p#p44 [data-aid]>
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> {text}
#> <p#p11 [data-aid]>
#> {text}
#> <span.page-break [data-page]>
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> <p#p12 [data-aid]>
#> <em>
#> {text}
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> <p#p13 [data-aid]>
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> <p#p14 [data-aid]>
#> <em>
#> {text}
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> <p#p15 [data-aid]>
#> {text}
#> <p#p16 [data-aid]>
#> {text}
#> <a.scripture-ref [href]>
#> {text}
#> {text}
#> <p#p17 [data-aid]>
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> <p#p18 [data-aid]>
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> <p#p19 [data-aid]>
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> {text}
#> <p#p38 [data-aid]>
#> {text}
#> <p#p39 [data-aid]>
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> <p#p20 [data-aid]>
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> {text}
#> <p#p21 [data-aid]>
#> {text}
#> <span.page-break [data-page]>
#> {text}
#> <p#p22 [data-aid]>
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> {text}
#> <p#p23 [data-aid]>
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> <p#p24 [data-aid]>
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> {text}
#> <em>
#> {text}
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> {text}
#> <p#p40 [data-aid]>
#> {text}
#> <p#p41 [data-aid]>
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> <p#p25 [data-aid]>
#> {text}
#> <a.scripture-ref [href]>
#> {text}
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> <p#p26 [data-aid]>
#> {text}
#> <p#p27 [data-aid]>
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> <p#p28 [data-aid]>
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> {text}
#> <p#p29 [data-aid]>
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> {text}
#> <p#p30 [data-aid]>
#> {text}
#> <a.scripture-ref [href]>
#> {text}
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> <p#p31 [data-aid]>
#> {text}
#> <a.scripture-ref [href]>
#> {text}
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> <p#p32 [data-aid]>
#> {text}
#> <p#p33 [data-aid]>
#> {text}
#> <span.page-break [data-page]>
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> <p#p34 [data-aid]>
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> {text}
#> <p#p35 [data-aid]>
#> {text}
#> <p#p36 [data-aid]>
#> {text}
#> <a.note-ref [to, href, data-scroll-id]>
#> <sup.marker>
#> {text}
#> <p#p37 [data-aid]>
#> {text}
Explore node 1:
# First child of the article body (the paragraph with id "p5").
rv_doc %>%
  html_elements(".body-block") %>%
  xml2::xml_child(1)
#> {html_node}
#> <p data-aid="144618637" id="p5">
#> [1] <a class="note-ref" to="[object Object]" href="#note1" data-scroll-id="no ...
Explore node 2:
# Second child of the article body (the paragraph with id "p6").
rv_doc %>%
  html_elements(".body-block") %>%
  xml2::xml_child(2)
#> {html_node}
#> <p data-aid="144618639" id="p6">
#> [1] <a class="note-ref" to="[object Object]" href="#note2" data-scroll-id="no ...
# Every node sitting directly inside the article body; on this page that is
# the 40 <p> paragraphs.
rv_doc %>%
  html_elements(".body-block") %>%
  xml_contents()
#> {xml_nodeset (40)}
#> [1] <p data-aid="144618637" id="p5">Righteousness and unity are profoundly s ...
#> [2] <p data-aid="144618639" id="p6">As a young man not of our faith, General ...
#> [3] <p data-aid="144618644" id="p7">In 1872, General Kane, his talented wife ...
#> [4] <p data-aid="144618648" id="p8">During the trip they stayed in Fillmore ...
#> [5] <p data-aid="144618651" id="p9">Elizabeth wrote that as Matilda was prep ...
#> [6] <p data-aid="144618656" id="p42">Matilda’s son’s reply was, “She said ‘T ...
#> [7] <p data-aid="144618659" id="p10">Elizabeth asked, “Will she really do th ...
#> [8] <p data-aid="144618665" id="p43">Matilda’s son answered, “Mother will se ...
#> [9] <p data-aid="144618668" id="p44">And so she did, and “they ate with perf ...
#> [10] <p data-aid="144618672" id="p11">As leaders, we are not under the illusi ...
#> [11] <p data-aid="144618676" id="p12"><em>Righteousness</em> is a broad, comp ...
#> [12] <p data-aid="144618679" id="p13">Being righteous is not dependent on eac ...
#> [13] <p data-aid="144618685" id="p14"><em>Unity</em> is also a broad, compreh ...
#> [14] <p data-aid="144618690" id="p15">The context for my message is the contr ...
#> [15] <p data-aid="144618696" id="p16">It has been 200 years since the Father ...
#> [16] <p data-aid="144618701" id="p17">The historical record we read in 4 Neph ...
#> [17] <p data-aid="144618706" id="p18">With respect to unity, 4 Nephi reads, “ ...
#> [18] <p data-aid="144618710" id="p19">Unfortunately, 4 Nephi then describes a ...
#> [19] <p data-aid="144618715" id="p38">“But O my son, how can a people like th ...
#> [20] <p data-aid="144618720" id="p39">“How can we expect that God will stay h ...
#> ...
# Same paragraphs, selected in one step with a CSS descendant selector:
# any <p> nested inside an element of class "body-block".
rv_doc %>%
  html_elements(".body-block p")
#> {xml_nodeset (40)}
#> [1] <p data-aid="144618637" id="p5">Righteousness and unity are profoundly s ...
#> [2] <p data-aid="144618639" id="p6">As a young man not of our faith, General ...
#> [3] <p data-aid="144618644" id="p7">In 1872, General Kane, his talented wife ...
#> [4] <p data-aid="144618648" id="p8">During the trip they stayed in Fillmore ...
#> [5] <p data-aid="144618651" id="p9">Elizabeth wrote that as Matilda was prep ...
#> [6] <p data-aid="144618656" id="p42">Matilda’s son’s reply was, “She said ‘T ...
#> [7] <p data-aid="144618659" id="p10">Elizabeth asked, “Will she really do th ...
#> [8] <p data-aid="144618665" id="p43">Matilda’s son answered, “Mother will se ...
#> [9] <p data-aid="144618668" id="p44">And so she did, and “they ate with perf ...
#> [10] <p data-aid="144618672" id="p11">As leaders, we are not under the illusi ...
#> [11] <p data-aid="144618676" id="p12"><em>Righteousness</em> is a broad, comp ...
#> [12] <p data-aid="144618679" id="p13">Being righteous is not dependent on eac ...
#> [13] <p data-aid="144618685" id="p14"><em>Unity</em> is also a broad, compreh ...
#> [14] <p data-aid="144618690" id="p15">The context for my message is the contr ...
#> [15] <p data-aid="144618696" id="p16">It has been 200 years since the Father ...
#> [16] <p data-aid="144618701" id="p17">The historical record we read in 4 Neph ...
#> [17] <p data-aid="144618706" id="p18">With respect to unity, 4 Nephi reads, “ ...
#> [18] <p data-aid="144618710" id="p19">Unfortunately, 4 Nephi then describes a ...
#> [19] <p data-aid="144618715" id="p38">“But O my son, how can a people like th ...
#> [20] <p data-aid="144618720" id="p39">“How can we expect that God will stay h ...
#> ...
# Third route to the paragraphs: take the body block, then its child
# elements.
rv_doc %>%
  html_elements(".body-block") %>%
  html_children()
#> {xml_nodeset (40)}
#> [1] <p data-aid="144618637" id="p5">Righteousness and unity are profoundly s ...
#> [2] <p data-aid="144618639" id="p6">As a young man not of our faith, General ...
#> [3] <p data-aid="144618644" id="p7">In 1872, General Kane, his talented wife ...
#> [4] <p data-aid="144618648" id="p8">During the trip they stayed in Fillmore ...
#> [5] <p data-aid="144618651" id="p9">Elizabeth wrote that as Matilda was prep ...
#> [6] <p data-aid="144618656" id="p42">Matilda’s son’s reply was, “She said ‘T ...
#> [7] <p data-aid="144618659" id="p10">Elizabeth asked, “Will she really do th ...
#> [8] <p data-aid="144618665" id="p43">Matilda’s son answered, “Mother will se ...
#> [9] <p data-aid="144618668" id="p44">And so she did, and “they ate with perf ...
#> [10] <p data-aid="144618672" id="p11">As leaders, we are not under the illusi ...
#> [11] <p data-aid="144618676" id="p12"><em>Righteousness</em> is a broad, comp ...
#> [12] <p data-aid="144618679" id="p13">Being righteous is not dependent on eac ...
#> [13] <p data-aid="144618685" id="p14"><em>Unity</em> is also a broad, compreh ...
#> [14] <p data-aid="144618690" id="p15">The context for my message is the contr ...
#> [15] <p data-aid="144618696" id="p16">It has been 200 years since the Father ...
#> [16] <p data-aid="144618701" id="p17">The historical record we read in 4 Neph ...
#> [17] <p data-aid="144618706" id="p18">With respect to unity, 4 Nephi reads, “ ...
#> [18] <p data-aid="144618710" id="p19">Unfortunately, 4 Nephi then describes a ...
#> [19] <p data-aid="144618715" id="p38">“But O my son, how can a people like th ...
#> [20] <p data-aid="144618720" id="p39">“How can we expect that God will stay h ...
#> ...
# Every <header> element anywhere in the document. The page has several, so
# this selector alone is too broad to isolate the article's own header.
rv_doc %>%
  html_elements("header")
#> {xml_nodeset (7)}
#> [1] <header class="panelHeader-2k7Jd backToAll-1PgB6"><a class="backText-1xON ...
#> [2] <header class="panelHeader-2k7Jd contentHead-3F0ox"><button class="sc-1g7 ...
#> [3] <header class="bookmarkHeader-2Bn20"><span class="bookmarkManagerTitle-1U ...
#> [4] <header class="downloadHead-3O2wO">Downloads</header>
#> [5] <header class="settingsHead-3iDND">Footnotes</header>
#> [6] <header class="settingsHead-3iDND">Theme</header>
#> [7] <header><span class="page-break" data-page="18"></span><div class="bvqtyr ...
# Narrow the search to the <header> inside the ".body" container, then pull
# its text: the talk title, the author lines, and the summary sentence.
rv_doc %>%
  html_elements(".body") %>%
  html_elements("header") %>%
  html_text2()
#> [1] "Hearts Knit in Righteousness and Unity\n\nBy Elder Quentin L. Cook\n\nOf the Quorum of the Twelve Apostles\n\nAt this 200-year hinge point in our Church history, let us commit ourselves to live righteously and be united as never before."
Get a specific paragraph by its id:
# "#p5" is a CSS id selector: it matches the single element whose id
# attribute is "p5".
rv_doc %>%
  html_elements("#p5")
#> {xml_nodeset (1)}
#> [1] <p data-aid="144618637" id="p5">Righteousness and unity are profoundly si ...
Get multiple things at the same time (headers and paragraphs):
# A comma-separated selector list matches elements satisfying either
# selector: <h2> headings or <p> paragraphs inside the article body.
rv_doc %>%
  html_elements(".body-block h2, .body-block p")
#> {xml_nodeset (40)}
#> [1] <p data-aid="144618637" id="p5">Righteousness and unity are profoundly s ...
#> [2] <p data-aid="144618639" id="p6">As a young man not of our faith, General ...
#> [3] <p data-aid="144618644" id="p7">In 1872, General Kane, his talented wife ...
#> [4] <p data-aid="144618648" id="p8">During the trip they stayed in Fillmore ...
#> [5] <p data-aid="144618651" id="p9">Elizabeth wrote that as Matilda was prep ...
#> [6] <p data-aid="144618656" id="p42">Matilda’s son’s reply was, “She said ‘T ...
#> [7] <p data-aid="144618659" id="p10">Elizabeth asked, “Will she really do th ...
#> [8] <p data-aid="144618665" id="p43">Matilda’s son answered, “Mother will se ...
#> [9] <p data-aid="144618668" id="p44">And so she did, and “they ate with perf ...
#> [10] <p data-aid="144618672" id="p11">As leaders, we are not under the illusi ...
#> [11] <p data-aid="144618676" id="p12"><em>Righteousness</em> is a broad, comp ...
#> [12] <p data-aid="144618679" id="p13">Being righteous is not dependent on eac ...
#> [13] <p data-aid="144618685" id="p14"><em>Unity</em> is also a broad, compreh ...
#> [14] <p data-aid="144618690" id="p15">The context for my message is the contr ...
#> [15] <p data-aid="144618696" id="p16">It has been 200 years since the Father ...
#> [16] <p data-aid="144618701" id="p17">The historical record we read in 4 Neph ...
#> [17] <p data-aid="144618706" id="p18">With respect to unity, 4 Nephi reads, “ ...
#> [18] <p data-aid="144618710" id="p19">Unfortunately, 4 Nephi then describes a ...
#> [19] <p data-aid="144618715" id="p38">“But O my son, how can a people like th ...
#> [20] <p data-aid="144618720" id="p39">“How can we expect that God will stay h ...
#> ...
# id attribute of each <h2> heading inside the article body.
# NOTE: on this page the combined "h2, p" query returns the same 40 nodes as
# the paragraph-only query, so this is expected to be empty here.
header_ids <- rv_doc %>%
  html_elements(".body-block h2") %>%
  html_attr("id")
# id attribute of each <p> paragraph inside the article body, built the same
# way as header_ids. html_attr("id") returns the id strings; the previous
# html_element("#p1") instead searched *inside* every paragraph for a
# descendant with id "p1" and so produced only missing nodes, not ids.
p_ids <- rv_doc %>%
  html_elements(".body-block p") %>%
  html_attr("id")
# Keep the body block's child nodes in a variable for later inspection.
xm_contents <- rv_doc %>%
  html_elements(".body-block") %>%
  xml_contents()
# Look for <p> elements nested inside the contents of the first paragraph.
# That paragraph only holds text and a footnote link (see the structure
# printed earlier), so nothing matches and the result is an empty nodeset.
rv_doc %>%
  html_elements(".body-block") %>%
  # html_children() %>%
  xml_child(1) %>%
  xml_contents() %>%
  html_elements("p")
#> {xml_nodeset (0)}
Scrape the metadata for the URL:
# Collect every <meta> tag from the document <head>; these carry the page
# title, description, and Open Graph properties.
rv_doc %>%
  html_elements("head") %>%
  html_elements("meta")
#> {xml_nodeset (10)}
#> [1] <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">\n
#> [2] <meta charset="utf-8">\n
#> [3] <meta name="viewport" content="width=device-width,initial-scale=1">\n
#> [4] <meta data-react-helmet="true" name="Search.doc-aid" content="144618619">\n
#> [5] <meta data-react-helmet="true" name="title" content="Hearts Knit in Righ ...
#> [6] <meta data-react-helmet="true" name="description" content="Elder Cook en ...
#> [7] <meta data-react-helmet="true" property="og:image" content="https://medi ...
#> [8] <meta data-react-helmet="true" property="og:title" content="Hearts Knit ...
#> [9] <meta data-react-helmet="true" property="og:type" content="website">\n
#> [10] <meta data-react-helmet="true" property="og:url" content="https://www.ch ...