Shixiang Wang

>上士闻道
勤而行之

字符串操作

王诗翔 · 2018-08-15

分类: r  
标签: r   stringr  

打印

"Hello world!"
#> [1] "Hello world!"
print("Hello world!")
#> [1] "Hello world!"

不带索引(输出不含 [1] 前缀)

cat("Hello world!")
#> Hello world!
message("Hello world!")
#> Hello world!

连接

paste("Hello", "world!")
#> [1] "Hello world!"
paste("Hello", "world", sep = "-")
#> [1] "Hello-world"
paste0("Hello", "World")
#> [1] "HelloWorld"
paste(c("A", "B"), c(1,2))
#> [1] "A 1" "B 2"
paste(c("A", "B"), c(1,2), collapse = ",", sep = "")
#> [1] "A1,B2"

转换

转换大小写

toupper("Hello")
#> [1] "HELLO"
tolower("HeLLo")
#> [1] "hello"

字符计数

nchar("Hello!")
#> [1] 6

清除首末空白

trimws(c(" P ", " S", "P "))
#> [1] "P" "S" "P"

取子串

dates = c("Jan 3", "Feb 10", "Nov 15")
substr(dates, 1, 3)
#> [1] "Jan" "Feb" "Nov"
substr(dates, 5, nchar(dates))
#> [1] "3"  "10" "15"
substring(dates, 5)
#> [1] "3"  "10" "15"

切分

strsplit("a,bb,ccc", split=",")
#> [[1]]
#> [1] "a"   "bb"  "ccc"
students = strsplit(c("Tony, 26, Physics", "James, 25, Economics"), split = ",")
students
#> [[1]]
#> [1] "Tony"     " 26"      " Physics"
#> 
#> [[2]]
#> [1] "James"      " 25"        " Economics"
students_matrix = do.call(rbind, students)
students_matrix
#>      [,1]    [,2]  [,3]        
#> [1,] "Tony"  " 26" " Physics"  
#> [2,] "James" " 25" " Economics"

以空字符串切分(拆成单个字符)

strsplit(c("hello", "world"), split = "")
#> [[1]]
#> [1] "h" "e" "l" "l" "o"
#> 
#> [[2]]
#> [1] "w" "o" "r" "l" "d"

格式化日期/时间

返回当前日期和时间:

Sys.Date()
#> [1] "2020-08-09"
Sys.time()
#> [1] "2020-08-09 12:20:52 CST"

解析为日期与时间

as.Date(1000, "1970-01-01")
#> [1] "1972-09-27"
my_date = as.Date("2018-08-15")
my_date
#> [1] "2018-08-15"
my_date + 3
#> [1] "2018-08-18"
my_date - 2
#> [1] "2018-08-13"
my_date - as.Date("2015-12-20")
#> Time difference of 969 days
my_time = as.POSIXct("2018-08-15 22:11:20")
my_time
#> [1] "2018-08-15 22:11:20 CST"
# 改变的是秒
my_time + 10
#> [1] "2018-08-15 22:11:30 CST"
as.Date("2015.04.02", format = "%Y.%m.%d")
#> [1] "2015-04-02"
strptime("7/25/2018 08:04:23", "%m/%d/%Y %H:%M:%S")
#> [1] "2018-07-25 08:04:23 CST"

日期到字符串

my_date
#> [1] "2018-08-15"
as.character(my_date, format = "%Y.%m.%d")
#> [1] "2018.08.15"

其实直接调用 format() 就可以做到,而且这是更推荐的写法(自 R 4.3.0 起,给 as.character() 传入 format 参数会产生警告):

format(my_date, format = "%Y.%m.%d")
#> [1] "2018.08.15"

字符串匹配

library(stringr)
strings <- c(" 219 733 8965", "329-293-8753 ", "banana", "595 794 7569",
  "387 287 6718", "apple", "233.398.9187  ", "482 952 3315",
  "239 923 8115 and 842 566 4692", "Work: 579-499-7527", "$1000",
  "Home: 543.355.3679")
phone <- "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})"
str_extract(strings, phone)
#>  [1] "219 733 8965" "329-293-8753" NA             "595 794 7569" "387 287 6718"
#>  [6] NA             "233.398.9187" "482 952 3315" "239 923 8115" "579-499-7527"
#> [11] NA             "543.355.3679"

正则表达式中的圆括号 () 用于定义捕获分组:str_match() 返回的矩阵第一列是完整匹配,其后各列依次是每个分组匹配到的内容。

str_match(strings, phone)
#>       [,1]           [,2]  [,3]  [,4]  
#>  [1,] "219 733 8965" "219" "733" "8965"
#>  [2,] "329-293-8753" "329" "293" "8753"
#>  [3,] NA             NA    NA    NA    
#>  [4,] "595 794 7569" "595" "794" "7569"
#>  [5,] "387 287 6718" "387" "287" "6718"
#>  [6,] NA             NA    NA    NA    
#>  [7,] "233.398.9187" "233" "398" "9187"
#>  [8,] "482 952 3315" "482" "952" "3315"
#>  [9,] "239 923 8115" "239" "923" "8115"
#> [10,] "579-499-7527" "579" "499" "7527"
#> [11,] NA             NA    NA    NA    
#> [12,] "543.355.3679" "543" "355" "3679"
# Extract/match all
str_extract_all(strings, phone)
#> [[1]]
#> [1] "219 733 8965"
#> 
#> [[2]]
#> [1] "329-293-8753"
#> 
#> [[3]]
#> character(0)
#> 
#> [[4]]
#> [1] "595 794 7569"
#> 
#> [[5]]
#> [1] "387 287 6718"
#> 
#> [[6]]
#> character(0)
#> 
#> [[7]]
#> [1] "233.398.9187"
#> 
#> [[8]]
#> [1] "482 952 3315"
#> 
#> [[9]]
#> [1] "239 923 8115" "842 566 4692"
#> 
#> [[10]]
#> [1] "579-499-7527"
#> 
#> [[11]]
#> character(0)
#> 
#> [[12]]
#> [1] "543.355.3679"
str_match_all(strings, phone)
#> [[1]]
#>      [,1]           [,2]  [,3]  [,4]  
#> [1,] "219 733 8965" "219" "733" "8965"
#> 
#> [[2]]
#>      [,1]           [,2]  [,3]  [,4]  
#> [1,] "329-293-8753" "329" "293" "8753"
#> 
#> [[3]]
#>      [,1] [,2] [,3] [,4]
#> 
#> [[4]]
#>      [,1]           [,2]  [,3]  [,4]  
#> [1,] "595 794 7569" "595" "794" "7569"
#> 
#> [[5]]
#>      [,1]           [,2]  [,3]  [,4]  
#> [1,] "387 287 6718" "387" "287" "6718"
#> 
#> [[6]]
#>      [,1] [,2] [,3] [,4]
#> 
#> [[7]]
#>      [,1]           [,2]  [,3]  [,4]  
#> [1,] "233.398.9187" "233" "398" "9187"
#> 
#> [[8]]
#>      [,1]           [,2]  [,3]  [,4]  
#> [1,] "482 952 3315" "482" "952" "3315"
#> 
#> [[9]]
#>      [,1]           [,2]  [,3]  [,4]  
#> [1,] "239 923 8115" "239" "923" "8115"
#> [2,] "842 566 4692" "842" "566" "4692"
#> 
#> [[10]]
#>      [,1]           [,2]  [,3]  [,4]  
#> [1,] "579-499-7527" "579" "499" "7527"
#> 
#> [[11]]
#>      [,1] [,2] [,3] [,4]
#> 
#> [[12]]
#>      [,1]           [,2]  [,3]  [,4]  
#> [1,] "543.355.3679" "543" "355" "3679"