Skip to content

Commit b3ab4d1

Browse files
authored
Merge pull request #35 from till-tietz/develop
Develop
2 parents 0e7e8fa + db6d2f7 commit b3ab4d1

14 files changed

Lines changed: 114 additions & 33 deletions

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Type: Package
22
Package: parsel
33
Title: Parallel Dynamic Web-Scraping Using 'RSelenium'
4-
Version: 0.1.1
4+
Version: 0.2.0
55
Authors@R: c(
66
person("Till", "Tietz", email = "ttietz2014@gmail.com", role = c("cre","aut")))
77
Description: A system to increase the efficiency of dynamic web-scraping with 'RSelenium'

NEWS.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,11 @@
1+
# parsel 0.2.0
2+
3+
* Added 'RSelenium' constructor functions. These functions are wrappers around 'RSelenium' methods that allow you to quickly and easily render safe, ready-to-use
4+
'RSelenium' scraping code to the console and paste it into your scraping functions.
5+
Constructors can be piped together via `%>>%` to allow for intuitive, sequential
6+
construction of scraping code.
7+
8+
19
# parsel 0.1.0
210

311
* Added a `NEWS.md` file to track changes to the package.

R/constructors_elements.R

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ gen_varname <- function(input){
2929
#' @param using character string specifying locator scheme to use to search elements. Available schemes: "class name", "css selector", "id", "name", "link text", "partial link text", "tag name", "xpath".
3030
#' @param value character string specifying the search target.
3131
#' @param name character string specifying the object name the RSelenium "wElement" class object should be saved to.
32-
#' @param new_page logical indicating if clickElement() action will resullt in a change in url.
32+
#' @param new_page logical indicating if clickElement() action will result in a change in url.
3333
#' @param prev a placeholder for the output of functions being piped into click(). Defaults to NULL and should not be altered.
3434
#' @return a character string defining 'RSelenium' clicking instructions that can be pasted into a scraping function.
3535
#' @export
@@ -125,7 +125,7 @@ click <- function(using, value, name = NULL, new_page = FALSE, prev = NULL){
125125
#' @param name character string specifying the object name the RSelenium "wElement" class object should be saved to. If NULL a name will be generated automatically.
126126
#' @param text a character vector specifying the text to be typed.
127127
#' @param text_object a character string specifying the name of an external object holding the text to be typed. Note that the remDr$sendKeysToElement method only accepts list inputs.
128-
#' @param new_page logical indicating if sendKeysToElement() action will resullt in a change in url.
128+
#' @param new_page logical indicating if sendKeysToElement() action will result in a change in url.
129129
#' @param prev a placeholder for the output of functions being piped into type(). Defaults to NULL and should not be altered.
130130
#' @return a character string defining 'RSelenium' typing instructions that can be pasted into a scraping function.
131131
#' @export
@@ -271,8 +271,8 @@ type <- function(using, value, name = NULL, text, text_object, new_page = FALSE,
271271
#' @param using character string specifying locator scheme to use to search elements. Available schemes: "class name", "css selector", "id", "name", "link text", "partial link text", "tag name", "xpath".
272272
#' @param value character string specifying the search target.
273273
#' @param name character string specifying the object name the RSelenium "wElement" class object should be saved to. If NULL a name will be generated automatically.
274+
#' @param multiple logical indicating whether multiple elements should be returned. If TRUE the findElements() method will be invoked.
274275
#' @param prev a placeholder for the output of functions being piped into get_element(). Defaults to NULL and should not be altered.
275-
#' @param multiple logical indicating whether multiple elements should be returned. If TRUE the findElements() method will be invoced.
276276
#' @return a character string defining 'RSelenium' getElementText() instructions that can be pasted into a scraping function.
277277
#' @export
278278
#'
@@ -342,34 +342,32 @@ get_element <- function(using, value, name = NULL, multiple = FALSE, prev = NULL
342342
}
343343

344344

345-
if(multiple == FALSE){
345+
if(multiple){
346346

347-
finding <- paste(name, " <- ", "try(", "remDr$findElement(using = '", using,"', '", value, "')", ")", sep = "")
347+
finding <- paste(name, " <- ", "try(", "remDr$findElements(using = '", using,"', '", value, "')", ")", sep = "")
348348

349349
out <- paste(finding,
350350
paste("if(is(", name, ",'try-error')){", sep = ""),
351351
paste(name, " <- NA", sep = ""),
352352
"} else {",
353-
paste(name, " <- ", name,"$getElementText()", sep = ""),
353+
paste(name, " <- ", "lapply(", name, ", function(i) ","i$getElementText())", sep = ""),
354354
"}",
355355
sep = " \n")
356356

357357
} else {
358358

359-
finding <- paste(name, " <- ", "try(", "remDr$findElements(using = '", using,"', '", value, "')", ")", sep = "")
359+
finding <- paste(name, " <- ", "try(", "remDr$findElement(using = '", using,"', '", value, "')", ")", sep = "")
360360

361361
out <- paste(finding,
362362
paste("if(is(", name, ",'try-error')){", sep = ""),
363363
paste(name, " <- NA", sep = ""),
364364
"} else {",
365-
paste(name, " <- ", "lapply(", name, ", function(i) ","i$getElementText())", sep = ""),
365+
paste(name, " <- ", name,"$getElementText()", sep = ""),
366366
"}",
367367
sep = " \n")
368368

369369
}
370370

371-
372-
373371
if(!is.null(prev)){
374372
out <- paste(prev, out, sep = " \n \n ")
375373
}

R/constructors_navigation.R

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#' wrapper around remDr$mavigate method to generate safe navigation code
1+
#' wrapper around remDr$navigate method to generate safe navigation code
22
#'
33
#' @param url a character string specifying the name of the object holding the url string or the url string the function should navigate to.
44
#' @param prev a placeholder for the output of functions being piped into go(). Defaults to NULL and should not be altered.
@@ -82,3 +82,43 @@ goback <- function(prev = NULL){
8282
return(out)
8383
}
8484

85+
86+
#' wrapper around remDr$goForward method to generate safe forwards navigation code
87+
#'
88+
#' @param prev a placeholder for the output of functions being piped into goforward(). Defaults to NULL and should not be altered.
89+
#' @return a character string defining 'RSelenium' forward navigation instructions that can be pasted into a scraping function.
90+
#' @export
91+
#'
92+
#' @examples
93+
#' \dontrun{
94+
#'
95+
#' goforward() %>>%
96+
#' show()
97+
#'
98+
#' }
99+
100+
goforward <- function(prev = NULL){
101+
102+
not_forward <- "not_forward <- TRUE"
103+
104+
from <- "from <- seleniumPipes::getCurrentUrl(remDr)"
105+
106+
go_forward <- "remDr$goForward()"
107+
108+
while_loop <- paste("while(not_forward){",
109+
"Sys.sleep(0.25)",
110+
"current <- seleniumPipes::getCurrentUrl(remDr)",
111+
"if(current != from){",
112+
"not_forward <- FALSE",
113+
"}",
114+
"}",
115+
sep = "\n")
116+
117+
out <- paste("# navigate forward to new url", not_forward, from, go_forward, while_loop, sep = "\n")
118+
119+
if(!is.null(prev)){
120+
out <- paste(prev, out, sep = " \n \n ")
121+
}
122+
123+
return(out)
124+
}

R/constructors_pipe.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
#' pipe-like operator that passes the ouput of lhs to the prev argument of rhs to paste together a scraper function in sequence.
1+
#' pipe-like operator that passes the output of lhs to the prev argument of rhs to paste together a scraper function in sequence.
22
#'
33
#' @param lhs a parsel constructor function call
4-
#' @param rhs a parsel constructor fuction call that should accept lhs as its prev argument
4+
#' @param rhs a parsel constructor function call that should accept lhs as its prev argument
55
#' @return the output of rhs evaluated with lhs as the prev argument
66
#' @export
77
#'

R/parscrape.R

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,16 @@
3939
#'}
4040

4141

42-
parscrape <- function(scrape_fun, scrape_input, cores = NULL, packages = c("base"), browser, ports = NULL, chunk_size = NULL, scrape_tries = 1, proxy = NULL, extraCapabilities = list()) {
42+
parscrape <- function(scrape_fun,
43+
scrape_input,
44+
cores = NULL,
45+
packages = c("base"),
46+
browser,
47+
ports = NULL,
48+
chunk_size = NULL,
49+
scrape_tries = 1,
50+
proxy = NULL,
51+
extraCapabilities = list()) {
4352

4453

4554
if(!is.function(scrape_fun)){

cran-comments.md

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,4 @@
1-
## Resubmission
2-
This is a resubmission. In this version I have:
3-
4-
* written all package names in single quotes. Changed RSelenium to 'RSelenium' in Description title.
5-
* added \value tag to close_rselenium.Rd
6-
* converted close_rselenium to internal function
7-
* added executable example to parscrape.Rd
8-
* ensured that examples and tests use at most 2 cores
1+
This is a minor release introducing new user-facing functionality.
92

103
## Test environments
114
* local Windows install, R 4.1.1
@@ -18,6 +11,6 @@ This is a resubmission. In this version I have:
1811

1912
## R CMD check results
2013

21-
0 errors | 0 warnings | 1 note
14+
0 errors | 0 warnings | 0 notes
15+
2216

23-
* This is a new release.

man/click.Rd

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/get_element.Rd

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/go.Rd

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)