phrmendes · August 21, 2022 20:48
diff --git a/get_from_pdf.R b/get_from_pdf.R
 # packages ----

 packages <- c(
   "curl", "glue", "pdftools", "stringr", "tibble", "purrr", "tesseract"
 )

 # Install only the packages that are not available yet, instead of
 # reinstalling every package each time the script runs.
 is_installed <- vapply(packages, requireNamespace, logical(1), quietly = TRUE)
 if (!all(is_installed)) {
   install.packages(packages[!is_installed])
 }

 # library() stops with an error when a package cannot be attached;
 # require() would only return FALSE and let the script continue.
 invisible(lapply(packages, library, character.only = TRUE))

 # function ----

 #' Download a PDF, OCR it, and extract the first CEP and CNPJ in its text.
 #'
 #' @param x URL of the PDF to download.
 #' @param pages Page numbers to OCR, forwarded to
 #'   `pdftools::pdf_ocr_text()`. `NULL` (the default) is that function's own
 #'   default.
 #' @return A tibble with columns `cep` and `cnpj`, each holding the first
 #'   match of its pattern in the OCR text (`NA` when there is no match).
 get_from_pdf <- function(x, pages = NULL) {
   # NOTE(review): the `.` separators are unescaped, so they match any
   # character, not only a literal dot. Presumably this tolerates OCR
   # misreads -- confirm, or tighten to `\\.`.
   regex_cnpj <- "[0-9]{2}.[0-9]{3}.[0-9]{3}/[0-9]{4}-[0-9]{2}"
   regex_cep <- "[0-9]{5}-[0-9]{3}"

   # Download to a temporary .pdf and remove it when the function exits,
   # whether it returns normally or fails.
   pdf_path <- tempfile(fileext = ".pdf")
   on.exit(unlink(pdf_path), add = TRUE)
   curl::curl_download(x, pdf_path)

   # OCR the requested pages and join the per-page text into one string.
   txt <- pdftools::pdf_ocr_text(
     pdf_path,
     pages = pages,
     dpi = 600,
     language = "por"
   ) |>
     stringr::str_flatten()

   # A plain string pattern is already interpreted as a regular expression
   # by stringr, so no regex() wrapper is needed.
   tibble::tibble(
     cep = stringr::str_extract(txt, regex_cep),
     cnpj = stringr::str_extract(txt, regex_cnpj)
   )
 }

 # request ----

 urls <- c("url.com/x.pdf", "url.com/y.pdf")

 # Call get_from_pdf() once per URL and row-bind the results.
 df <- purrr::map_dfr(urls, get_from_pdf)
	# packages ----

	packages <- c(
	  "curl", "glue", "pdftools", "stringr", "tibble", "purrr", "tesseract"
	)

	# Install only the packages that are not available yet, instead of
	# reinstalling every package each time the script runs.
	is_installed <- vapply(packages, requireNamespace, logical(1), quietly = TRUE)
	if (!all(is_installed)) {
	  install.packages(packages[!is_installed])
	}

	# library() stops with an error when a package cannot be attached;
	# require() would only return FALSE and let the script continue.
	invisible(lapply(packages, library, character.only = TRUE))

	# function ----

	#' Download a PDF, OCR it, and extract the first CEP and CNPJ in its text.
	#'
	#' @param x URL of the PDF to download.
	#' @param pages Page numbers to OCR, forwarded to
	#'   `pdftools::pdf_ocr_text()`. `NULL` (the default) is that function's own
	#'   default.
	#' @return A tibble with columns `cep` and `cnpj`, each holding the first
	#'   match of its pattern in the OCR text (`NA` when there is no match).
	get_from_pdf <- function(x, pages = NULL) {
	  # NOTE(review): the `.` separators are unescaped, so they match any
	  # character, not only a literal dot. Presumably this tolerates OCR
	  # misreads -- confirm, or tighten to `\\.`.
	  regex_cnpj <- "[0-9]{2}.[0-9]{3}.[0-9]{3}/[0-9]{4}-[0-9]{2}"
	  regex_cep <- "[0-9]{5}-[0-9]{3}"

	  # Download to a temporary .pdf and remove it when the function exits,
	  # whether it returns normally or fails.
	  pdf_path <- tempfile(fileext = ".pdf")
	  on.exit(unlink(pdf_path), add = TRUE)
	  curl::curl_download(x, pdf_path)

	  # OCR the requested pages and join the per-page text into one string.
	  txt <- pdftools::pdf_ocr_text(
	    pdf_path,
	    pages = pages,
	    dpi = 600,
	    language = "por"
	  ) |>
	    stringr::str_flatten()

	  # A plain string pattern is already interpreted as a regular expression
	  # by stringr, so no regex() wrapper is needed.
	  tibble::tibble(
	    cep = stringr::str_extract(txt, regex_cep),
	    cnpj = stringr::str_extract(txt, regex_cnpj)
	  )
	}

	# request ----

	urls <- c("url.com/x.pdf", "url.com/y.pdf")

	# Call get_from_pdf() once per URL and row-bind the results.
	df <- purrr::map_dfr(urls, get_from_pdf)
No results found