/php/php-2darray-filter
This commit is contained in:
parent
916f3a4405
commit
d8a084c69b
@ -51,7 +51,8 @@ export default defineConfig({
|
||||
{ text: '基于PHP8.x制作workman环境镜像', link: '/php/docker-php8' },
|
||||
{ text: '源码编译安装PHP8.2环境', link: '/php/php82' },
|
||||
{ text: '为PHP7.4安装pgsql和pdo_pgsql扩展', link: '/php/php74-pgsql' },
|
||||
{ text: 'PHP的hash()函数中不同algo算法的性能对比', link: '/php/hash-algo' }
|
||||
{ text: 'PHP的hash()函数中不同algo算法的性能对比', link: '/php/hash-algo' },
|
||||
{ text: 'PHP对二维数组做去重并取得重复项(10万+记录)', link: '/php/php-2darray-filter' }
|
||||
]
|
||||
},
|
||||
{
|
||||
|
367
docs/src/php/php-2darray-filter.md
Normal file
367
docs/src/php/php-2darray-filter.md
Normal file
@ -0,0 +1,367 @@
|
||||
# PHP对二维数组做去重并取得重复项(10万行记录,array_filter太慢,array_map配合array_diff速度最快)
|
||||
|
||||
## 需求背景
|
||||
|
||||
长度大约10万级别的二维数组,元素内数组长度10个左右(其实就是一个数据表的结果集合),根据指定字段对数据进行去重,最后要得到去重后 **被丢弃的** 数据明细。
|
||||
|
||||
> 典型应用场景就是Excel表格导入数据库前,对表格中数据先做一次去重预处理并把原数据、处理后数据、丢弃的数据都展示给用户,待用户确认后再执行数据导入数据库。
|
||||
|
||||
## 用array_map和array_diff实现
|
||||
|
||||
### 过程1 - 根据指定字段对数组内元素进行去重
|
||||
|
||||
```php
|
||||
/**
 * De-duplicate a list of rows by the value stored under one field.
 *
 * The first row seen for each distinct value is kept, and the result keeps
 * the order in which those first rows appear in the input. Rows that do not
 * carry the field (or hold null there) are grouped under the empty-string
 * key, so at most one of them survives.
 *
 * @param array  $arr list of associative rows
 * @param string $key field whose value decides whether two rows are duplicates
 *
 * @return array re-indexed list of the surviving rows
 */
function arrayUniqueByKey(array $arr, string $key): array
{
    $firstSeen = [];

    foreach ($arr as $row) {
        // Every row is inspected individually; a single row without the
        // field must not wipe out the whole result.
        $value = $row[$key] ?? '';

        // Only the first occurrence is stored, later duplicates are skipped.
        if (!isset($firstSeen[$value])) {
            $firstSeen[$value] = $row;
        }
    }

    return array_values($firstSeen);
}
|
||||
```
|
||||
|
||||
> 遍历1次数组是必须的,遍历过程中只是拿关键字段的值当作键名去创建一个新数组,这样当关键字段重复时,赋值就相当于覆盖,进而实现去重。
|
||||
|
||||
### 过程2 - 求原数组和被去重后数组的差集
|
||||
|
||||
```php
|
||||
/**
 * Occurrence-aware difference of two lists of rows ($arr1 minus $arr2).
 *
 * Each row of $arr2 cancels exactly one identical row of $arr1, so when
 * $arr1 holds the same row twice and $arr2 holds it once, the second copy is
 * reported. A plain array_diff() on the encoded rows would drop both copies
 * and hide fully identical duplicates.
 *
 * @param array $arr1 list of rows to subtract from
 * @param array $arr2 list of rows to subtract
 *
 * @return array re-indexed rows of $arr1 that are left over, unmodified
 */
function arrayDiff(array $arr1, array $arr2): array
{
    // Flatten every row to "<n>:<serialized row>", where n is how many times
    // that exact row has been seen so far. serialize() is used only as a
    // fingerprint (never decoded back); unlike json_encode() it does not
    // fail on strings that are not valid UTF-8.
    $fingerprint = static function (array $rows): array {
        $occurrences = [];

        return array_map(static function ($row) use (&$occurrences) {
            $flat = serialize($row);
            $occurrences[$flat] = ($occurrences[$flat] ?? 0) + 1;

            return $occurrences[$flat] . ':' . $flat;
        }, $rows);
    };

    // array_map() and array_diff() both preserve the keys of $arr1.
    $leftover = array_diff($fingerprint($arr1), $fingerprint($arr2));

    // Pick the original rows by key instead of decoding the fingerprints,
    // so values come back exactly as they went in.
    return array_values(array_intersect_key($arr1, $leftover));
}
|
||||
```
|
||||
|
||||
> 先把两个多维数组转成一维数组,再用array_diff()求得一维数组差集,最后把一维差集再转回多维。
|
||||
|
||||
### 完整测试代码
|
||||
|
||||
```php
|
||||
<?php
|
||||
|
||||
ini_set('memory_limit', '1G');
|
||||
|
||||
/**
 * Snapshot the current system time, split into display-ready parts with
 * millisecond resolution.
 *
 * @author Aaron <chenqiang@h024.cn>
 *
 * @return array date/time components as strings, plus 'timestamp' (int
 *               seconds) and 'timestamp_micro' (float seconds)
 */
function getSystemTimes(): array
{
    // microtime() without arguments returns "0.uuuuuu00 ssssssssss".
    [$usec, $sec] = explode(' ', microtime());
    $microtime = (float) $usec + (float) $sec;
    $time = (int) $sec;

    // Take the first three fractional digits straight from the string.
    // Rounding the fraction could reach 1.000 and produce a four-digit
    // "1000" millisecond field without advancing the second.
    $millisecond = substr($usec, 2, 3);

    $weekdays = ['日', '一', '二', '三', '四', '五', '六'];
    $clock = date('H:i:s', $time);
    $dateTime = date('Y-m-d H:i:s', $time);

    return [
        'year' => date('Y', $time),
        'month' => date('m', $time),
        'day' => date('d', $time),
        'hour' => date('H', $time),
        'hour_pre' => date('A', $time),
        'hour_12' => date('h', $time),
        'minute' => date('i', $time),
        'second' => date('s', $time),
        'millisecond' => $millisecond,
        'date' => date('Y-m-d', $time),
        'time' => $clock,
        'weekday' => '星期' . $weekdays[(int) date('w', $time)],
        'weekday_index' => date('w', $time),
        'time_milli' => "{$clock}.{$millisecond}",
        'datetime' => $dateTime,
        'datetime_milli' => "{$dateTime}.{$millisecond}",
        'timestamp' => $time,
        'timestamp_micro' => $microtime,
    ];
}
|
||||
|
||||
/**
 * De-duplicate a list of rows by the value stored under one field.
 *
 * The first row seen for each distinct value is kept, and the result keeps
 * the order in which those first rows appear in the input. Rows that do not
 * carry the field (or hold null there) are grouped under the empty-string
 * key, so at most one of them survives.
 *
 * @author Aaron <chenqiang@h024.cn>
 *
 * @param array  $arr list of associative rows
 * @param string $key field whose value decides whether two rows are duplicates
 *
 * @return array re-indexed list of the surviving rows
 */
function arrayUniqueByKey(array $arr, string $key): array
{
    $firstSeen = [];

    foreach ($arr as $row) {
        // Every row is inspected individually; a single row without the
        // field must not wipe out the whole result.
        $value = $row[$key] ?? '';

        // Only the first occurrence is stored, later duplicates are skipped.
        if (!isset($firstSeen[$value])) {
            $firstSeen[$value] = $row;
        }
    }

    return array_values($firstSeen);
}
|
||||
|
||||
/**
 * Occurrence-aware difference of two lists of rows ($arr1 minus $arr2;
 * mind the argument order).
 *
 * Each row of $arr2 cancels exactly one identical row of $arr1, so when
 * $arr1 holds the same row twice and $arr2 holds it once, the second copy is
 * reported. A plain array_diff() on the encoded rows would drop both copies
 * and hide fully identical duplicates.
 *
 * @author Aaron <chenqiang@h024.cn>
 *
 * @param array $arr1 list of rows to subtract from
 * @param array $arr2 list of rows to subtract
 *
 * @return array re-indexed rows of $arr1 that are left over, unmodified
 */
function arrayDiff(array $arr1, array $arr2): array
{
    // Flatten every row to "<n>:<serialized row>", where n is how many times
    // that exact row has been seen so far. serialize() is used only as a
    // fingerprint (never decoded back); unlike json_encode() it does not
    // fail on strings that are not valid UTF-8.
    $fingerprint = static function (array $rows): array {
        $occurrences = [];

        return array_map(static function ($row) use (&$occurrences) {
            $flat = serialize($row);
            $occurrences[$flat] = ($occurrences[$flat] ?? 0) + 1;

            return $occurrences[$flat] . ':' . $flat;
        }, $rows);
    };

    // array_map() and array_diff() both preserve the keys of $arr1.
    $leftover = array_diff($fingerprint($arr1), $fingerprint($arr2));

    // Pick the original rows by key instead of decoding the fingerprints,
    // so values come back exactly as they went in.
    return array_values(array_intersect_key($arr1, $leftover));
}
|
||||
|
||||
// Benchmark driver: build the sample rows, de-duplicate them by phone number,
// then list the discarded rows, printing a timestamp and duration per phase.
$arr = [];
$total = 999999 + 1;
$stamp0 = getSystemTimes();
echo "\n数据量: {$total}";
echo "\n生成数组开始: " . $stamp0['datetime_milli'];
for ($i = 0; $i < $total; $i++) {
    $index = $i;
    if ($i % 101 == 0) {
        $index = $i - 1; // reuse the previous index to create a fully identical row
    }
    $suffix = str_pad($index, 8, '0', STR_PAD_LEFT);
    if ($index % 113 == 0) {
        // same phone number as the previous index, other fields differ
        $suffix = str_pad($index - 1, 8, '0', STR_PAD_LEFT);
    }
    $arr[] = [
        'name' => '优优' . $index,
        'phone' => "133{$suffix}",
        'var1' => $index,
        'var2' => '',
        'var3' => '',
        'var4' => '',
        'var5' => $index,
    ];
}
$stamp1 = getSystemTimes();
// The subtractions are parenthesized on purpose: before PHP 8 "." and "-"
// share the same precedence, so the unparenthesized form would subtract from
// the concatenated string instead of printing the elapsed time.
echo "\n生成数组完成: " . $stamp1['datetime_milli'] . ';耗时:' . ($stamp1['timestamp_micro'] - $stamp0['timestamp_micro']) . 's';
echo "\n取去重后数组开始: " . $stamp1['datetime_milli'];
$arr2 = arrayUniqueByKey($arr, 'phone');
$stamp2 = getSystemTimes();
echo "\n取去重后数组完成: " . $stamp2['datetime_milli'] . ';耗时:' . ($stamp2['timestamp_micro'] - $stamp1['timestamp_micro']) . 's';
echo "\n去重求差开始: " . $stamp2['datetime_milli'];
$repeated = arrayDiff($arr, $arr2);
$stamp3 = getSystemTimes();
echo "\n去重求差完成: " . $stamp3['datetime_milli'] . ';耗时:' . ($stamp3['timestamp_micro'] - $stamp2['timestamp_micro']) . 's';
echo "\n重复记录数量: " . count($repeated);
echo "\n计算总耗时: " . ($stamp3['timestamp_micro'] - $stamp1['timestamp_micro']) . 's';
echo "\n";
|
||||
```
|
||||
|
||||
结果
|
||||
|
||||
```txt
|
||||
数据量: 1000000
|
||||
生成数组开始: 2024-11-08 09:06:35.930
|
||||
生成数组完成: 2024-11-08 09:06:36.467;耗时:0.53709101676941s
|
||||
取去重后数组开始: 2024-11-08 09:06:36.467
|
||||
取去重后数组完成: 2024-11-08 09:06:38.566;耗时:2.099249124527s
|
||||
去重求差开始: 2024-11-08 09:06:38.566
|
||||
去重求差完成: 2024-11-08 09:06:40.383;耗时:1.8165800571442s
|
||||
重复记录数量: 8762
|
||||
计算总耗时: 3.9158291816711s
|
||||
```
|
||||
|
||||
从结果可以看到,数据量是1000000,生成模拟数据大概花了0.5s,去重花的时间大概是2.1s,求差集花的时间大概是1.8s,去重+求差大约在4秒完成,这个时效对于这个数据量来说还是可以接受的。
|
||||
|
||||
### 结论
|
||||
|
||||
原生数组操作函数的性能一般都是很高的,能用原生就尽量用原生。
|
||||
|
||||
用array_map加工数组元素的时间效率很高;
|
||||
|
||||
用array_diff求数组差集的时间效率很高;
|
||||
|
||||
用array_reverse得到倒置数组的时间效率很高;
|
||||
|
||||
## 用array_filter实现
|
||||
|
||||
### 核心代码
|
||||
|
||||
```php
|
||||
// Keep the rows of $arr1 that do not occur in $arr2. The third argument makes
// in_array() compare rows strictly, so numeric-looking strings such as
// '0123' and '123' are not treated as equal. Every call scans $arr2 from the
// start, so the total cost grows with count($arr1) * count($arr2).
$r = array_filter($arr1, function ($v) use ($arr2) {
    return !in_array($v, $arr2, true);
});
|
||||
```
|
||||
|
||||
是的,就是这么几行,很简洁,下面我们就试试
|
||||
|
||||
### 完整测试代码
|
||||
|
||||
```php
|
||||
<?php
|
||||
|
||||
ini_set('memory_limit', '1G');
|
||||
|
||||
/**
 * Snapshot the current system time, split into display-ready parts with
 * millisecond resolution.
 *
 * @author Aaron <chenqiang@h024.cn>
 *
 * @return array date/time components as strings, plus 'timestamp' (int
 *               seconds) and 'timestamp_micro' (float seconds)
 */
function getSystemTimes(): array
{
    // microtime() without arguments returns "0.uuuuuu00 ssssssssss".
    [$usec, $sec] = explode(' ', microtime());
    $microtime = (float) $usec + (float) $sec;
    $time = (int) $sec;

    // Take the first three fractional digits straight from the string.
    // Rounding the fraction could reach 1.000 and produce a four-digit
    // "1000" millisecond field without advancing the second.
    $millisecond = substr($usec, 2, 3);

    $weekdays = ['日', '一', '二', '三', '四', '五', '六'];
    $clock = date('H:i:s', $time);
    $dateTime = date('Y-m-d H:i:s', $time);

    return [
        'year' => date('Y', $time),
        'month' => date('m', $time),
        'day' => date('d', $time),
        'hour' => date('H', $time),
        'hour_pre' => date('A', $time),
        'hour_12' => date('h', $time),
        'minute' => date('i', $time),
        'second' => date('s', $time),
        'millisecond' => $millisecond,
        'date' => date('Y-m-d', $time),
        'time' => $clock,
        'weekday' => '星期' . $weekdays[(int) date('w', $time)],
        'weekday_index' => date('w', $time),
        'time_milli' => "{$clock}.{$millisecond}",
        'datetime' => $dateTime,
        'datetime_milli' => "{$dateTime}.{$millisecond}",
        'timestamp' => $time,
        'timestamp_micro' => $microtime,
    ];
}
|
||||
|
||||
/**
 * De-duplicate a list of rows by one field using array_filter().
 *
 * The first row for each distinct field value is kept. Original array keys
 * are preserved (array_filter() does not re-index).
 *
 * NOTE: in_array() scans the list of already-seen values linearly on every
 * call, so the total work is quadratic in the number of rows. That is the
 * behavior this benchmark measures; for production code, store the seen
 * values as array keys and test them with isset() instead.
 *
 * @author Aaron Chen <qiang.c@wukezhenzhu.com>
 *
 * @param array  $arr       list of associative rows
 * @param string $uniqueKey field whose value decides whether two rows are duplicates
 *
 * @return array rows that survived, keyed as in $arr
 */
function arrayUniqueFilter(array $arr, string $uniqueKey): array
{
    $uniqueIds = [];

    return array_filter($arr, function ($item) use (&$uniqueIds, $uniqueKey) {
        // Strict comparison: with loose comparison numeric-looking strings
        // such as '0123' and '123' would be treated as the same value and a
        // non-duplicate row would be dropped.
        if (in_array($item[$uniqueKey], $uniqueIds, true)) {
            return false; // value already seen: drop this row
        }

        $uniqueIds[] = $item[$uniqueKey]; // remember the value
        return true;                      // first occurrence: keep this row
    });
}
|
||||
|
||||
/**
 * Occurrence-aware difference of two lists of rows ($arr1 minus $arr2;
 * mind the argument order).
 *
 * Each row of $arr2 cancels exactly one identical row of $arr1, so when
 * $arr1 holds the same row twice and $arr2 holds it once, the second copy is
 * reported. A plain array_diff() on the encoded rows would drop both copies
 * and hide fully identical duplicates.
 *
 * @author Aaron <chenqiang@h024.cn>
 *
 * @param array $arr1 list of rows to subtract from
 * @param array $arr2 list of rows to subtract
 *
 * @return array re-indexed rows of $arr1 that are left over, unmodified
 */
function arrayDiff(array $arr1, array $arr2): array
{
    // Flatten every row to "<n>:<serialized row>", where n is how many times
    // that exact row has been seen so far. serialize() is used only as a
    // fingerprint (never decoded back); unlike json_encode() it does not
    // fail on strings that are not valid UTF-8.
    $fingerprint = static function (array $rows): array {
        $occurrences = [];

        return array_map(static function ($row) use (&$occurrences) {
            $flat = serialize($row);
            $occurrences[$flat] = ($occurrences[$flat] ?? 0) + 1;

            return $occurrences[$flat] . ':' . $flat;
        }, $rows);
    };

    // array_map() and array_diff() both preserve the keys of $arr1.
    $leftover = array_diff($fingerprint($arr1), $fingerprint($arr2));

    // Pick the original rows by key instead of decoding the fingerprints,
    // so values come back exactly as they went in.
    return array_values(array_intersect_key($arr1, $leftover));
}
|
||||
|
||||
// Benchmark driver: build the sample rows, de-duplicate them by phone number
// with the array_filter() variant, then list the discarded rows, printing a
// timestamp and duration per phase.
$arr = [];
$total = 99999 + 1;
$stamp0 = getSystemTimes();
echo "\n数据量: {$total}";
echo "\n生成数组开始: " . $stamp0['datetime_milli'];
for ($i = 0; $i < $total; $i++) {
    $index = $i;
    if ($i % 101 == 0) {
        $index = $i - 1; // reuse the previous index to create a fully identical row
    }
    $suffix = str_pad($index, 8, '0', STR_PAD_LEFT);
    if ($index % 113 == 0) {
        // same phone number as the previous index, other fields differ
        $suffix = str_pad($index - 1, 8, '0', STR_PAD_LEFT);
    }
    $arr[] = [
        'name' => '优优' . $index,
        'phone' => "133{$suffix}",
        'var1' => $index,
        'var2' => '',
        'var3' => '',
        'var4' => '',
        'var5' => $index,
    ];
}
$stamp1 = getSystemTimes();
// The subtractions are parenthesized on purpose: before PHP 8 "." and "-"
// share the same precedence, so the unparenthesized form would subtract from
// the concatenated string instead of printing the elapsed time.
echo "\n生成数组完成: " . $stamp1['datetime_milli'] . ';耗时:' . ($stamp1['timestamp_micro'] - $stamp0['timestamp_micro']) . 's';
echo "\n取去重后数组开始: " . $stamp1['datetime_milli'];
$arr2 = arrayUniqueFilter($arr, 'phone');
$stamp2 = getSystemTimes();
echo "\n取去重后数组完成: " . $stamp2['datetime_milli'] . ';耗时:' . ($stamp2['timestamp_micro'] - $stamp1['timestamp_micro']) . 's';
echo "\n去重后数组长度: " . count($arr2);
echo "\n去重求差开始: " . $stamp2['datetime_milli'];
$repeated = arrayDiff($arr, $arr2);
$stamp3 = getSystemTimes();
echo "\n去重求差完成: " . $stamp3['datetime_milli'] . ';耗时:' . ($stamp3['timestamp_micro'] - $stamp2['timestamp_micro']) . 's';
echo "\n重复记录数量: " . count($repeated);
echo "\n计算总耗时: " . ($stamp3['timestamp_micro'] - $stamp1['timestamp_micro']) . 's';
echo "\n";
|
||||
```
|
||||
|
||||
上面这段程序在阿里云2核2G主机上的运行结果如下:
|
||||
|
||||
```sh
|
||||
数据量: 100000
|
||||
生成数组开始: 2024-11-09 02:14:22.305
|
||||
生成数组完成: 2024-11-09 02:14:22.345;耗时:0.039474964141846s
|
||||
取去重后数组开始: 2024-11-09 02:14:22.345
|
||||
取去重后数组完成: 2024-11-09 02:18:30.495;耗时:248.1507999897s
|
||||
去重后数组长度: 98143
|
||||
去重求差开始: 2024-11-09 02:18:30.495
|
||||
去重求差完成: 2024-11-09 02:18:30.634;耗时:0.13901495933533s
|
||||
重复记录数量: 876
|
||||
计算总耗时: 248.28981494904s
|
||||
```
|
||||
|
||||
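从上面的结果可以看到,用array_filter()方法来处理二维数据时,10万行记录竟然花了4分钟。需要说明的是,耗时主要来自回调函数里对"已出现键值列表"做的 `in_array()` 线性查找——每处理一行都要把列表从头扫描一遍,整体复杂度是 O(n²),而不是 `array_filter()` 本身慢;如果把已出现的键值存成数组的键、改用 `isset()` 判断,就能降到线性时间。算了,这种写法不能再继续研究了。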
|
||||
|
||||
## 最终结论
|
||||
|
||||
处理很大的数组时,如果内存足够,可以尽量使用`array_map`来提高执行速度。
|
Loading…
Reference in New Issue
Block a user