All Spark RDD operators (recommended: type each one out by hand)

Notes:

1. All of the methods below come from RDD.scala; you can read the source yourself.

2. Run the code with $SPARK_HOME/bin/spark-shell.

3. The commented-out parts are the run results.

//org.apache.spark.rdd
//RDD.scala

// Transformations (return a new RDD)

1.1 map
Return a new RDD by applying a function to all elements of this RDD.

def map[U: ClassTag](f: T => U): RDD[U]

val a = sc.parallelize(1 to 9, 2)
a.collect
//res0: Array[Int] = Array(1, 2, 3, 4, 5, 6, 7, 8, 9)
val b = a.map(x => x*2)
b.collect
//res1: Array[Int] = Array(2, 4, 6, 8, 10, 12, 14, 16, 18)

1.2 flatMap
Return a new RDD by first applying a function to all elements of this RDD, and then flattening the results.

def flatMap[U: ClassTag](f: T => TraversableOnce[U]): RDD[U]

val a = sc.parallelize(1 to 9, 2)
val d = a.flatMap(15 to _*2)
d.collect
//res2: Array[Int] = Array(15, 16, 15, 16, 17, 18)

1.3 filter
Return a new RDD containing only the elements that satisfy a predicate.

def filter(f: T => Boolean): RDD[T]

val a = sc.parallelize(1 to 9, 2)
a.filter(_ > 5).collect
//res4: Array[Int] = Array(6, 7, 8, 9)

1.4 distinct
Return a new RDD containing the distinct elements in this RDD.

def distinct(): RDD[T]

val f = sc.makeRDD(Array(1,2,3,1,2,3))
f.distinct.collect
//res9: Array[Int] = Array(2, 1, 3)

1.5 repartition
Return a new RDD that has exactly numPartitions partitions.

def repartition(numPartitions: Int)(implicit ord: Ordering[T] = null): RDD[T]

val a = sc.parallelize(1 to 9, 2)
a.glom.collect
//res10: Array[Array[Int]] = Array(Array(1, 2, 3, 4), Array(5, 6, 7, 8, 9))
val b = a.repartition(3)  // repartition returns a new RDD; a itself is unchanged
b.glom.collect
//res19: Array[Array[Int]] = Array(Array(3, 6, 9), Array(1, 4, 7), Array(2, 5, 8))

1.6 coalesce
Return a new RDD that is reduced into numPartitions partitions.

def coalesce(numPartitions: Int, shuffle: Boolean = false,
             partitionCoalescer: Option[PartitionCoalescer] = Option.empty)
            (implicit ord: Ordering[T] = null): RDD[T]

val a = sc.parallelize(1 to 9, 2)
a.glom.collect
//res10: Array[Array[Int]] = Array(Array(1, 2, 3, 4), Array(5, 6, 7, 8, 9))
val c = a.coalesce(3,true)
c.glom.collect
//res22: Array[Array[Int]] = Array(Array(3, 6, 9), Array(1, 4, 7), Array(2, 5, 8))
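By default shuffle = false (repartition is simply coalesce with shuffle = true). A minimal sketch of the no-shuffle path, relying on the fact that without a shuffle partitions can only be merged, never split:

// shuffle = false (the default): partitions are merged locally, no shuffle
val d = a.coalesce(1)
d.glom.collect
//expected: Array(Array(1, 2, 3, 4, 5, 6, 7, 8, 9))

// without a shuffle, coalesce cannot increase the partition count
a.coalesce(3).partitions.length
//expected: 2 (unchanged, since a only has 2 partitions)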

1.7 sample
Return a sampled subset of this RDD.

def sample(
    withReplacement: Boolean,
    fraction: Double,
    seed: Long = Utils.random.nextLong): RDD[T]

val a = sc.parallelize(0 to 9, 2)
val b = a.sample(true, 0.1)
b.collect
//res27: Array[Int] = Array(4)
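The fraction is only the expected size of the sample, not a guarantee, which is why sampling 0.1 of ten elements came back with a single element above. A small sketch of the seed parameter, which makes the sample reproducible across runs:

val c = a.sample(false, 0.5, 42)
c.collect
// the exact elements depend on the seed; rerunning with seed 42 yields the same subset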

1.8 randomSplit
Randomly splits this RDD with the provided weights.

def randomSplit(
    weights: Array[Double],
    seed: Long = Utils.random.nextLong): Array[RDD[T]]

val i = sc.makeRDD(0 to 9, 3).randomSplit(Array(0.3, 0.2, 0.5))
i(0).collect
//res15: Array[Int] = Array(2, 8)
i(1).collect
//res16: Array[Int] = Array(0, 5, 7, 9)
i(2).collect
//res17: Array[Int] = Array(1, 3, 4, 6)

1.9 takeSample
Return a fixed-size sampled subset of this RDD in an array.

def takeSample(
    withReplacement: Boolean,
    num: Int,
    seed: Long = Utils.random.nextLong): Array[T]

// sampling with replacement
sc.makeRDD(0 to 9, 3).takeSample(true,3)
//res20: Array[Int] = Array(7, 7, 6)
// sampling without replacement
sc.makeRDD(0 to 9, 3).takeSample(false,9)
//res23: Array[Int] = Array(6, 2, 1, 9, 3, 0, 8, 4, 5)
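Unlike sample, takeSample is an action: it returns exactly num elements in a local array on the driver. One edge case worth knowing (standard Spark behavior): without replacement, requesting more elements than the RDD holds simply returns the whole dataset, shuffled:

sc.makeRDD(0 to 9, 3).takeSample(false, 20).length
//expected: 10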

1.10 union
Return the union of this RDD and another one. Any identical elements will appear multiple times (use .distinct() to eliminate them).

def union(other: RDD[T]): RDD[T]
def ++(other: RDD[T]): RDD[T] = withScope {
this.union(other)
}

val r1 = sc.makeRDD(1 to 4)
val r2 = sc.makeRDD(3 to 6)
r1.union(r2).collect
//res24: Array[Int] = Array(1, 2, 3, 4, 3, 4, 5, 6)
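As the doc string says, duplicates survive the union; chain .distinct to drop them (.sorted is only added here to make the driver-side output deterministic):

r1.union(r2).distinct.collect.sorted
//expected: Array(1, 2, 3, 4, 5, 6)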

1.11 sortBy
Return this RDD sorted by the given key function.

def sortBy[K](
    f: (T) => K,
    ascending: Boolean = true,
    numPartitions: Int = this.partitions.length)
    (implicit ord: Ordering[K], ctag: ClassTag[K]): RDD[T]

val rdd1 = sc.parallelize(List(("tom", 1), ("jerry", 3), ("kitty", 2)))
rdd1.sortBy(_._2, false).collect
//res26: Array[(String, Int)] = Array((jerry,3), (kitty,2), (tom,1))
rdd1.sortBy(x => x._2%2, false).collect
//res30: Array[(String, Int)] = Array((tom,1), (jerry,3), (kitty,2))

1.12 intersection
Return the intersection of this RDD and another one. The output will not contain any duplicate elements, even if the input RDDs did.

Note: This method performs a shuffle internally.

def intersection(
    other: RDD[T],
    partitioner: Partitioner)(implicit ord: Ordering[T] = null): RDD[T]
def intersection(other: RDD[T], numPartitions: Int): RDD[T]

val rdd1 = sc.parallelize(List(("tom", 1), ("jerry", 3), ("kitty", 2)))
val rdd2 = sc.parallelize(List(("jerry", 2), ("shuke", 2), ("kitty", 2)))
rdd1.intersection(rdd2).collect
//res32: Array[(String, Int)] = Array((kitty,2))

1.13 glom
Return an RDD created by coalescing all elements within each partition into an array.

def glom(): RDD[Array[T]]

sc.makeRDD(0 to 9, 3).glom.collect
//res34: Array[Array[Int]] = Array(Array(0, 1, 2), Array(3, 4, 5), Array(6, 7, 8, 9))

1.14 cartesian
Return the Cartesian product of this RDD and another one, that is, the RDD of all pairs of elements (a, b) where a is in this and b is in other.

def cartesian[U: ClassTag](other: RDD[U]): RDD[(T, U)]

val r1 = sc.makeRDD(1 to 3, 2)
val r2 = sc.makeRDD(4 to 6, 2)
r1.cartesian(r2).collect
//res1: Array[(Int, Int)] = Array((1,4), (1,5), (1,6), (2,4), (3,4), (2,5), (2,6), (3,5), (3,6))

1.15 groupBy
Return an RDD of grouped items.

Note: This operation may be very expensive. If you are grouping in order to perform an aggregation (such as a sum or average) over each key, using PairRDDFunctions.aggregateByKey or PairRDDFunctions.reduceByKey will provide much better performance.

def groupBy[K](f: T => K)(implicit kt: ClassTag[K]): RDD[(K, Iterable[T])]
def groupBy[K](f: T => K, numPartitions: Int)(implicit kt: ClassTag[K]): RDD[(K, Iterable[T])]
def groupBy[K](f: T => K, p: Partitioner)(implicit kt: ClassTag[K], ord: Ordering[K] = null)
    : RDD[(K, Iterable[T])]

val rdd1 = sc.parallelize(List(("tom", 1), ("jerry", 3), ("kitty", 2), ("tom", 2), ("kitty", 3)))
rdd1.groupBy(_._2).collect
//res5: Array[(Int, Iterable[(String, Int)])] = Array((2,CompactBuffer((kitty,2), (tom,2))), (1,CompactBuffer((tom,1))), (3,CompactBuffer((jerry,3), (kitty,3))))
rdd1.groupBy(_._1).collect
//res6: Array[(String, Iterable[(String, Int)])] = Array((tom,CompactBuffer((tom,1), (tom,2))), (jerry,CompactBuffer((jerry,3))), (kitty,CompactBuffer((kitty,2), (kitty,3))))
rdd1.groupByKey.collect
//res7: Array[(String, Iterable[Int])] = Array((tom,CompactBuffer(1, 2)), (jerry,CompactBuffer(3)), (kitty,CompactBuffer(2, 3)))
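Per the note above, when all you need per key is an aggregate, prefer reduceByKey: it combines values map-side before the shuffle instead of shipping every (key, value) pair the way groupByKey does. With the same rdd1:

rdd1.reduceByKey(_ + _).collect
//expected (pair order may vary): Array((tom,3), (jerry,3), (kitty,5))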

1.16 pipe
// pipe invokes an external script, feeding each RDD element to its standard input and collecting its standard output lines as the elements of the new RDD
Return an RDD created by piping elements to a forked external process.

def pipe(command: String): RDD[String]
def pipe(command: String, env: Map[String, String]): RDD[String]

vi /cube/bin/concat.sh

#!/bin/bash

RESULT="";
while read LINE; do
  RESULT=${RESULT}" "${LINE}
done
echo ${RESULT}

val rdd = sc.makeRDD(List("hi", "how", "are", "you", "fine", "thank", "you", "and", "you"), 2)
val pipeRDD = rdd.pipe("/cube/bin/concat.sh")
pipeRDD.collect
//res22: Array[String] = Array(hi how are you, fine thank you and you)
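The second overload passes environment variables to the forked process. A hedged sketch (the SEP variable and the script change are hypothetical, not from the original post): if the script used RESULT=${RESULT}${SEP}${LINE} instead of a hard-coded space, the separator could be injected like this:

val pipeEnvRDD = rdd.pipe("/cube/bin/concat.sh", Map("SEP" -> "|"))  // SEP is a made-up variable for illustration
pipeEnvRDD.collect
// each partition's elements would come back joined by "|"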

1.17 mapPartitions
Return a new RDD by applying a function to each partition of this RDD.

def mapPartitions[U: ClassTag](f: Iterator[T] => Iterator[U], preservesPartitioning: Boolean = false): RDD[U]

val rdd1 = sc.makeRDD(0 to 9, 3)
rdd1.mapPartitions(_.toList.reverse.iterator).collect
//res0: Array[Int] = Array(4, 3, 2, 1, 0, 9, 8, 7, 6, 5)
rdd1.mapPartitions(_.toList.sortWith(_.compareTo(_) > 0).iterator).collect
//res4: Array[Int] = Array(2, 1, 0, 5, 4, 3, 9, 8, 7, 6)
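The practical reason to reach for mapPartitions instead of map is that the function body runs once per partition, so one-time setup (a database connection, a heavyweight parser, ...) is amortized over all of a partition's elements. A minimal sketch, with setupCost standing in for such an initialization:

rdd1.mapPartitions { iter =>
  val setupCost = 1000  // imagine an expensive, once-per-partition initialization here
  iter.map(_ + setupCost)
}.collect
//expected: Array(1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009)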

1.18 mapPartitionsWithIndex
Return a new RDD by applying a function to each partition of this RDD, while tracking the index of the original partition.

def mapPartitionsWithIndex[U: ClassTag](f: (Int, Iterator[T]) => Iterator[U], preservesPartitioning: Boolean = false): RDD[U]

val rdd1 = sc.makeRDD(0 to 9, 3)
rdd1.mapPartitionsWithIndex((i,x) => x.map(_+i*1000).toList.reverse.iterator).collect
//res7: Array[Int] = Array(2, 1, 0, 1005, 1004, 1003, 2009, 2008, 2007, 2006)

1.19 zip
Zips this RDD with another one, returning key-value pairs with the first element in each RDD, second element in each RDD, etc.

def zip[U: ClassTag](other: RDD[U]): RDD[(T, U)]

// the two RDDs must have the same number of partitions and the same number of elements in each partition
val rdd = sc.makeRDD(List("hi", "how", "are", "you", "fine", "thank", "you", "and", "you"), 3)
val rdd1 = sc.makeRDD(1 to 9, 3)
rdd.zip(rdd1).collect
//res10: Array[(String, Int)] = Array((hi,1), (how,2), (are,3), (you,4), (fine,5), (thank,6), (you,7), (and,8), (you,9))

1.20 zipPartitions
Zip this RDD's partitions with one (or more) RDD(s) and return a new RDD by applying a function to the zipped partitions.

def zipPartitions[B: ClassTag, V: ClassTag]
(rdd2: RDD[B], preservesPartitioning: Boolean)
(f: (Iterator[T], Iterator[B]) => Iterator[V]): RDD[V]

def zipPartitions[B: ClassTag, C: ClassTag, V: ClassTag]
(rdd2: RDD[B], rdd3: RDD[C], preservesPartitioning: Boolean)
(f: (Iterator[T], Iterator[B], Iterator[C]) => Iterator[V]): RDD[V]

def zipPartitions[B: ClassTag, C: ClassTag, D: ClassTag, V: ClassTag]
(rdd2: RDD[B], rdd3: RDD[C], rdd4: RDD[D], preservesPartitioning: Boolean)
(f: (Iterator[T], Iterator[B], Iterator[C], Iterator[D]) => Iterator[V]): RDD[V]

val rdd1 = sc.makeRDD(1 to 9, 3)
val rdd2 = sc.makeRDD(List("hi", "how", "are", "you", "fine", "thank", "you", "and", "you"), 3)
rdd1.zipPartitions(rdd2) {
  (rdd1Iter, rdd2Iter) => {
    var result = List[String]()
    while (rdd1Iter.hasNext && rdd2Iter.hasNext) {
      result ::= (rdd1Iter.next() + "_" + rdd2Iter.next())
    }
    result.iterator
  }
}.collect
//res22: Array[String] = Array(3_are, 2_how, 1_hi, 6_thank, 5_fine, 4_you, 9_you, 8_and, 7_you)

1.21 zipWithIndex
Zips this RDD with its element indices.

def zipWithIndex(): RDD[(T, Long)]

val rdd = sc.makeRDD(List("hi", "how", "are", "you", "fine", "thank", "you", "and", "you"), 3)
rdd.zipWithIndex.collect
//res15: Array[(String, Long)] = Array((hi,0), (how,1), (are,2), (you,3), (fine,4), (thank,5), (you,6), (and,7), (you,8))

1.22 zipWithUniqueId
Zips this RDD with generated unique Long ids. Items in the kth partition will get ids k, n+k, 2*n+k, ..., where n is the number of partitions. In the example below n = 3, so partition 0 yields ids 0, 3, 6, partition 1 yields 1, 4, 7, and partition 2 yields 2, 5, 8.

def zipWithUniqueId(): RDD[(T, Long)]

val rdd = sc.makeRDD(List("hi", "how", "are", "you", "fine", "thank", "you", "and", "you"), 3)
rdd.zipWithUniqueId.collect
//res16: Array[(String, Long)] = Array((hi,0), (how,3), (are,6), (you,1), (fine,4), (thank,7), (you,2), (and,5), (you,8))

Original: https://www.cnblogs.com/zhuisuidefeng/p/16354215.html
Author: 追随的风
Title: SparkRDD所有算子操作,建议全部手敲一遍
