针对你的data.table数据清洗需求,这里提供一个系统性的解决方案流程:
## 一、整体思路框架
```r
library(data.table)
# 1. 数据加载与列选择
dt <- fread("your_data.csv", select = c("col1", "col2", "col3", ...))
# 2. 创建清洗脚本模块化结构
# 建议按列或按清洗类型组织代码
```
## 二、分步清洗策略
### 步骤1:制定清洗计划表
```r
# 在开始编码前,先创建清洗规则表(可用Excel或列表记录)
cleaning_rules <- list(
col1 = list(
type = "text_to_NA",
patterns = c("unknown", "N/A", "missing", ""),
method = "direct_na"
),
col2 = list(
type = "text_to_level",
mapping = c("低" = 1, "中" = 2, "高" = 3),
default = NA
),
col3 = list(
type = "numeric_operation",
operation = "divide",
divisor = 100,
na_if_zero = TRUE
)
)
```
### 步骤2:编写可复用的清洗函数
```r
# 函数1:文本转NA
clean_text_to_na <- function(x, patterns) {
x[grepl(paste(patterns, collapse = "|"), x, ignore.case = TRUE)] <- NA_character_
return(x)
}
# 函数2:文本转等级
clean_text_to_level <- function(x, mapping) {
result <- rep(NA_integer_, length(x))
for (text in names(mapping)) {
result[x == text] <- mapping[[text]]
}
return(result)
}
# 函数3:数值运算
clean_numeric_operation <- function(x, operation, divisor = NULL) {
x <- as.numeric(x)
if (operation == "divide" && !is.null(divisor)) {
x <- x / divisor
}
return(x)
}
```
### 步骤3:应用清洗规则
```r
# 方法1:逐个列应用规则
dt[, col1 := clean_text_to_na(col1, patterns = c("unknown", "N/A"))]
dt[, col2 := clean_text_to_level(col2, mapping = c("低"=1, "中"=2, "高"=3))]
dt[, col3 := clean_numeric_operation(col3, operation = "divide", divisor = 100)]
# 方法2:批量应用(如果规则复杂但统一)
clean_column <- function(dt, col_name, rule) {
if (rule$type == "text_to_NA") {
dt[, (col_name) := clean_text_to_na(get(col_name), rule$patterns)]
} else if (rule$type == "text_to_level") {
dt[, (col_name) := clean_text_to_level(get(col_name), rule$mapping)]
}
# ... 其他规则
return(dt)
}
```
## 三、高效处理建议
### 1. 使用data.table的 `.SD` 和 `lapply`
```r
# 批量处理多个文本列
text_cols <- c("col1", "col2", "col3")
patterns_to_na <- c("unknown", "N/A", "missing")
dt[, (text_cols) := lapply(.SD, clean_text_to_na, patterns = patterns_to_na),
.SDcols = text_cols]
```
### 2. 使用`fcase`进行多条件替换
```r
# 对文本内容赋值等级
dt[, level := fcase(
col_text %chin% c("优秀", "excellent"), 1,
col_text %chin% c("良好", "good"), 2,
col_text %chin% c("一般", "average"), 3,
default = NA_integer_
)]
```
### 3. 处理特定数字内容
```r
# 除以n并处理异常值
dt[, col_numeric := fifelse(
col_numeric > 0,
col_numeric / 100,
NA_real_
)]
# 或者使用data.table的 := 与 ifelse结合
dt[col_numeric > 1000, col_numeric := col_numeric / 1000]
```
## 四、工作流程建议
1. **先探索后清洗**:
```r
# 先查看数据概况
str(dt)
summary(dt)
# 查看唯一值分布
dt[, lapply(.SD, function(x) list(unique_values = unique(x)))]
```
2. **分步验证**:
```r
# 创建副本用于验证
dt_original <- copy(dt)
dt_clean <- copy(dt)
# 清洗后对比
compare_df <- data.table(
column = names(dt),
original_na = sapply(dt_original, function(x) sum(is.na(x))),
cleaned_na = sapply(dt_clean, function(x) sum(is.na(x))),
change = sapply(1:ncol(dt), function(i)
sum(dt_original[[i]] != dt_clean[[i]], na.rm = TRUE))
)
```
3. **创建清洗日志**:
```r
# 记录清洗操作
cleaning_log <- list(
timestamp = Sys.time(),
operations = list(
list(column = "col1", operation = "text_to_NA", patterns = c("unknown")),
list(column = "col2", operation = "text_to_level", levels = 3)
),
rows_affected = nrow(dt)
)
```
## 五、完整示例模板
```r
library(data.table)
# 1. 加载数据
dt <- fread("data.csv", select = c("id", "score_text", "value", "category"))
# 2. 定义清洗规则
rules <- list(
score_text = list(
action = "categorize",
mapping = list(
"优秀" = "A",
"良好" = "B",
"及格" = "C",
c("不及格", "失败") = "D"
)
),
value = list(
action = "scale",
operation = "/",
factor = 1000
),
category = list(
action = "replace_na",
replacement = "未知"
)
)
# 3. 应用清洗
dt[, score_category := fcase(
score_text %chin% c("优秀", "excellent"), "A",
score_text %chin% c("良好", "good"), "B",
default = "C"
)]
dt[, value_scaled := value / 1000]
dt[is.na(category), category := "未知"]
# 4. 保存结果
fwrite(dt, "cleaned_data.csv")
```
## 六、实用技巧
1. **使用临时列**:避免覆盖原始数据
```r
dt[, temp_col := your_operation(col)]
# 验证无误后再替换
dt[, col := temp_col][, temp_col := NULL]
```
2. **分块处理大文件**:
```r
chunk_size <- 1e6
con <- file("big_data.csv", "r")
while(nrow(chunk <- fread(con, nrows = chunk_size)) > 0) {
# 应用清洗
process_chunk(chunk)
}
```
3. **使用`set`函数提高效率**(大数据时):
```r
for(i in 1:nrow(dt)) {
if(dt[i, score] == "unknown") {
set(dt, i, "score", NA_character_)
}
}
```
这样的模块化方法能让复杂清洗变得可控。建议先从单个列开始,编写并测试清洗函数,确认无误后再扩展到其他列。