AlbertRapp · February 18, 2023 14:43
diff --git a/cleaning_company_labels.qmd b/cleaning_company_labels.qmd

 ## Renaming long names

 Load `tidyverse` and data from TidyTuesday.

 ```{r}
 library(tidyverse)
 # Fetch the TidyTuesday table of ticker symbols and company names from GitHub.
 big_tech_companies <- readr::read_csv(
   file = 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-02-07/big_tech_companies.csv'
 )
 # Auto-print the tibble to inspect the raw company labels.
 big_tech_companies
 # # A tibble: 14 × 2
 #    stock_symbol company                                  
 #    <chr>        <chr>                                    
 #  1 AAPL         Apple Inc.                               
 #  2 ADBE         Adobe Inc.                               
 #  3 AMZN         Amazon.com, Inc.                         
 #  4 CRM          Salesforce, Inc.                         
 #  5 CSCO         Cisco Systems, Inc.                      
 #  6 GOOGL        Alphabet Inc.                            
 #  7 IBM          International Business Machines Corporat…
 #  8 INTC         Intel Corporation                        
 #  9 META         Meta Platforms, Inc.                     
 # 10 MSFT         Microsoft Corporation                    
 # 11 NFLX         Netflix, Inc.                            
 # 12 NVDA         NVIDIA Corporation                       
 # 13 ORCL         Oracle Corporation                       
 # 14 TSLA         Tesla, Inc.  
 ```

 First, notice that IBM has a suuuuper long name.
 We can rename just that name in the `company` column with `mutate()` and `if_else()`.


 ```{r}
 # Swap in the short label where the ticker is IBM; other rows keep their name.
 big_tech_companies |>
   mutate(company = if_else(stock_symbol == 'IBM', 'IBM', company))
 # # A tibble: 14 × 2
 #    stock_symbol company              
 #    <chr>        <chr>                
 #  1 AAPL         Apple Inc.           
 #  2 ADBE         Adobe Inc.           
 #  3 AMZN         Amazon.com, Inc.     
 #  4 CRM          Salesforce, Inc.     
 #  5 CSCO         Cisco Systems, Inc.  
 #  6 GOOGL        Alphabet Inc.        
 #  7 IBM          IBM                  
 #  8 INTC         Intel Corporation    
 #  9 META         Meta Platforms, Inc. 
 # 10 MSFT         Microsoft Corporation
 # 11 NFLX         Netflix, Inc.        
 # 12 NVDA         NVIDIA Corporation   
 # 13 ORCL         Oracle Corporation   
 # 14 TSLA         Tesla, Inc.        
 ```

 ## Remove superfluous words

 Now, let's get rid of words like "Platforms" or "Corporation".
 Similarly, we can get rid of commas.
 We can do all that with `str_remove()`.

 ```{r}
 big_tech_companies |>
   mutate(
     # Shorten IBM's long name; other rows keep theirs.
     company = if_else(stock_symbol == 'IBM', 'IBM', company),
     # Wrap the alternatives in parentheses and separate them with |.
     company = str_remove(company, '(Platforms|Corporation|,)')
   )
 # # A tibble: 14 × 2
 #    stock_symbol company             
 #    <chr>        <chr>               
 #  1 AAPL         "Apple Inc."        
 #  2 ADBE         "Adobe Inc."        
 #  3 AMZN         "Amazon.com Inc."   
 #  4 CRM          "Salesforce Inc."   
 #  5 CSCO         "Cisco Systems Inc."
 #  6 GOOGL        "Alphabet Inc."     
 #  7 IBM          "IBM"               
 #  8 INTC         "Intel "            
 #  9 META         "Meta , Inc."       
 # 10 MSFT         "Microsoft "        
 # 11 NFLX         "Netflix Inc."      
 # 12 NVDA         "NVIDIA "           
 # 13 ORCL         "Oracle "           
 # 14 TSLA         "Tesla Inc."  
 ```

 Notice how this did not remove everything we wanted?
 That's a good use case for `str_remove_all()`.
 `str_remove()` only removes the first match in each string, but here you want to get rid of all possible matches.

 ```{r}
 big_tech_companies |>
   mutate(
     # Shorten IBM's long name; other rows keep theirs.
     company = if_else(stock_symbol == 'IBM', 'IBM', company),
     # Remove every match of the pattern, not just the first one per string.
     company = str_remove_all(company, '(Platforms|Corporation|,)')
   )
 # # A tibble: 14 × 2
 #    stock_symbol company             
 #    <chr>        <chr>               
 #  1 AAPL         "Apple Inc."        
 #  2 ADBE         "Adobe Inc."        
 #  3 AMZN         "Amazon.com Inc."   
 #  4 CRM          "Salesforce Inc."   
 #  5 CSCO         "Cisco Systems Inc."
 #  6 GOOGL        "Alphabet Inc."     
 #  7 IBM          "IBM"               
 #  8 INTC         "Intel "            
 #  9 META         "Meta  Inc."        
 # 10 MSFT         "Microsoft "        
 # 11 NFLX         "Netflix Inc."      
 # 12 NVDA         "NVIDIA "           
 # 13 ORCL         "Oracle "           
 # 14 TSLA         "Tesla Inc." 
 ```



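 ## Remove words with special characters

 Words like "Inc." and ".com" contain a dot, which is a special character in regular expressions (it matches any character).
 To match a literal dot, escape it as `\\.` inside the pattern.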


 ```{r}
 big_tech_companies |>
   mutate(
     # Shorten IBM's long name; other rows keep theirs.
     company = if_else(stock_symbol == 'IBM', 'IBM', company),
     # \\. matches a literal dot; an unescaped . would match any character.
     company = str_remove_all(
       company,
       '(Platforms|Corporation|Inc\\.|\\.com|,)'
     )
   )
 # # A tibble: 14 × 2
 #    stock_symbol company         
 #    <chr>        <chr>           
 #  1 AAPL         "Apple "        
 #  2 ADBE         "Adobe "        
 #  3 AMZN         "Amazon "       
 #  4 CRM          "Salesforce "   
 #  5 CSCO         "Cisco Systems "
 #  6 GOOGL        "Alphabet "     
 #  7 IBM          "IBM"           
 #  8 INTC         "Intel "        
 #  9 META         "Meta  "        
 # 10 MSFT         "Microsoft "    
 # 11 NFLX         "Netflix "      
 # 12 NVDA         "NVIDIA "       
 # 13 ORCL         "Oracle "       
 # 14 TSLA         "Tesla " 
 ```


 ## Remove trailing white space

 Here, `str_trim()` will help you.

 ```{r}
 big_tech_companies |>
   mutate(
     # Shorten IBM's long name; other rows keep theirs.
     company = if_else(stock_symbol == 'IBM', 'IBM', company),
     # Drop the filler words, the escaped-dot suffixes and the commas.
     company = str_remove_all(
       company,
       '(Platforms|Corporation|Inc\\.|\\.com|,)'
     ),
     # Strip the whitespace left over at the ends of the cleaned names.
     company = str_trim(company)
   )
 # # A tibble: 14 × 2
 #    stock_symbol company      
 #    <chr>        <chr>        
 #  1 AAPL         Apple        
 #  2 ADBE         Adobe        
 #  3 AMZN         Amazon       
 #  4 CRM          Salesforce   
 #  5 CSCO         Cisco Systems
 #  6 GOOGL        Alphabet     
 #  7 IBM          IBM          
 #  8 INTC         Intel        
 #  9 META         Meta         
 # 10 MSFT         Microsoft    
 # 11 NFLX         Netflix      
 # 12 NVDA         NVIDIA       
 # 13 ORCL         Oracle       
 # 14 TSLA         Tesla     
 ```

	## Renaming long names

	Load `tidyverse` and data from TidyTuesday.

	```{r}
	library(tidyverse)
	# Fetch the TidyTuesday table of ticker symbols and company names from GitHub.
	big_tech_companies <- readr::read_csv(
	  file = 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-02-07/big_tech_companies.csv'
	)
	# Auto-print the tibble to inspect the raw company labels.
	big_tech_companies
	# # A tibble: 14 × 2
	#    stock_symbol company
	#    <chr>        <chr>
	#  1 AAPL         Apple Inc.
	#  2 ADBE         Adobe Inc.
	#  3 AMZN         Amazon.com, Inc.
	#  4 CRM          Salesforce, Inc.
	#  5 CSCO         Cisco Systems, Inc.
	#  6 GOOGL        Alphabet Inc.
	#  7 IBM          International Business Machines Corporat…
	#  8 INTC         Intel Corporation
	#  9 META         Meta Platforms, Inc.
	# 10 MSFT         Microsoft Corporation
	# 11 NFLX         Netflix, Inc.
	# 12 NVDA         NVIDIA Corporation
	# 13 ORCL         Oracle Corporation
	# 14 TSLA         Tesla, Inc.
	```

	First, notice that IBM has a suuuuper long name.
	We can rename just that name in the `company` column with `mutate()` and `if_else()`.


	```{r}
	# Swap in the short label where the ticker is IBM; other rows keep their name.
	big_tech_companies |>
	  mutate(
	    company = if_else(
	      stock_symbol == 'IBM',
	      'IBM',
	      company
	    )
	  )
	# # A tibble: 14 × 2
	#    stock_symbol company
	#    <chr>        <chr>
	#  1 AAPL         Apple Inc.
	#  2 ADBE         Adobe Inc.
	#  3 AMZN         Amazon.com, Inc.
	#  4 CRM          Salesforce, Inc.
	#  5 CSCO         Cisco Systems, Inc.
	#  6 GOOGL        Alphabet Inc.
	#  7 IBM          IBM
	#  8 INTC         Intel Corporation
	#  9 META         Meta Platforms, Inc.
	# 10 MSFT         Microsoft Corporation
	# 11 NFLX         Netflix, Inc.
	# 12 NVDA         NVIDIA Corporation
	# 13 ORCL         Oracle Corporation
	# 14 TSLA         Tesla, Inc.
	```

	## Remove superfluous words

	Now, let's get rid of words like "Platforms" or "Corporation".
	Similarly, we can get rid of commas.
	We can do all that with `str_remove()`.

	```{r}
	big_tech_companies |>
	  mutate(
	    # Shorten IBM's long name; other rows keep theirs.
	    company = if_else(
	      stock_symbol == 'IBM',
	      'IBM',
	      company
	    ),
	    company = str_remove(
	      company,
	      # Put multiple things that need to be removed into parentheses and keep words apart with |
	      '(Platforms|Corporation|,)'
	    )
	  )
	# # A tibble: 14 × 2
	#    stock_symbol company
	#    <chr>        <chr>
	#  1 AAPL         "Apple Inc."
	#  2 ADBE         "Adobe Inc."
	#  3 AMZN         "Amazon.com Inc."
	#  4 CRM          "Salesforce Inc."
	#  5 CSCO         "Cisco Systems Inc."
	#  6 GOOGL        "Alphabet Inc."
	#  7 IBM          "IBM"
	#  8 INTC         "Intel "
	#  9 META         "Meta , Inc."
	# 10 MSFT         "Microsoft "
	# 11 NFLX         "Netflix Inc."
	# 12 NVDA         "NVIDIA "
	# 13 ORCL         "Oracle "
	# 14 TSLA         "Tesla Inc."
	```

	Notice how this did not remove everything we wanted?
	That's a good use case for `str_remove_all()`.
	`str_remove()` only removes the first match in each string, but here you want to get rid of all possible matches.

	```{r}
	big_tech_companies |>
	  mutate(
	    # Shorten IBM's long name; other rows keep theirs.
	    company = if_else(
	      stock_symbol == 'IBM',
	      'IBM',
	      company
	    ),
	    # Remove every match of the pattern, not just the first one per string.
	    company = str_remove_all(
	      company,
	      '(Platforms|Corporation|,)'
	    )
	  )
	# # A tibble: 14 × 2
	#    stock_symbol company
	#    <chr>        <chr>
	#  1 AAPL         "Apple Inc."
	#  2 ADBE         "Adobe Inc."
	#  3 AMZN         "Amazon.com Inc."
	#  4 CRM          "Salesforce Inc."
	#  5 CSCO         "Cisco Systems Inc."
	#  6 GOOGL        "Alphabet Inc."
	#  7 IBM          "IBM"
	#  8 INTC         "Intel "
	#  9 META         "Meta  Inc."
	# 10 MSFT         "Microsoft "
	# 11 NFLX         "Netflix Inc."
	# 12 NVDA         "NVIDIA "
	# 13 ORCL         "Oracle "
	# 14 TSLA         "Tesla Inc."
	```



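	## Remove words with special characters

	Words like "Inc." and ".com" contain a dot, which is a special character in regular expressions (it matches any character).
	To match a literal dot, escape it as `\\.` inside the pattern.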


	```{r}
	big_tech_companies |>
	  mutate(
	    # Shorten IBM's long name; other rows keep theirs.
	    company = if_else(
	      stock_symbol == 'IBM',
	      'IBM',
	      company
	    ),
	    # \\. matches a literal dot; an unescaped . would match any character.
	    company = str_remove_all(
	      company,
	      '(Platforms|Corporation|Inc\\.|\\.com|,)'
	    )
	  )
	# # A tibble: 14 × 2
	#    stock_symbol company
	#    <chr>        <chr>
	#  1 AAPL         "Apple "
	#  2 ADBE         "Adobe "
	#  3 AMZN         "Amazon "
	#  4 CRM          "Salesforce "
	#  5 CSCO         "Cisco Systems "
	#  6 GOOGL        "Alphabet "
	#  7 IBM          "IBM"
	#  8 INTC         "Intel "
	#  9 META         "Meta  "
	# 10 MSFT         "Microsoft "
	# 11 NFLX         "Netflix "
	# 12 NVDA         "NVIDIA "
	# 13 ORCL         "Oracle "
	# 14 TSLA         "Tesla "
	```


	## Remove trailing white space

	Here, `str_trim()` will help you.

	```{r}
	big_tech_companies |>
	  mutate(
	    # Shorten IBM's long name; other rows keep theirs.
	    company = if_else(
	      stock_symbol == 'IBM',
	      'IBM',
	      company
	    ),
	    # Drop the filler words, the escaped-dot suffixes and the commas.
	    company = str_remove_all(
	      company,
	      '(Platforms|Corporation|Inc\\.|\\.com|,)'
	    ),
	    # Strip the whitespace left over at the ends of the cleaned names.
	    company = str_trim(company)
	  )
	# # A tibble: 14 × 2
	#    stock_symbol company
	#    <chr>        <chr>
	#  1 AAPL         Apple
	#  2 ADBE         Adobe
	#  3 AMZN         Amazon
	#  4 CRM          Salesforce
	#  5 CSCO         Cisco Systems
	#  6 GOOGL        Alphabet
	#  7 IBM          IBM
	#  8 INTC         Intel
	#  9 META         Meta
	# 10 MSFT         Microsoft
	# 11 NFLX         Netflix
	# 12 NVDA         NVIDIA
	# 13 ORCL         Oracle
	# 14 TSLA         Tesla
	```