Commit 2c966c98
Authored 12 years ago by Josh Rosen

Change numSplits to numPartitions in PySpark.

Parent: 3b9f9294
No related branches, tags, or merge requests found.
Showing 2 changed files with 38 additions and 38 deletions:

python/pyspark/join.py: 10 additions, 10 deletions
python/pyspark/rdd.py: 28 additions, 28 deletions
python/pyspark/join.py (+10 −10)

@@ -32,13 +32,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 """
 
 
-def _do_python_join(rdd, other, numSplits, dispatch):
+def _do_python_join(rdd, other, numPartitions, dispatch):
     vs = rdd.map(lambda (k, v): (k, (1, v)))
     ws = other.map(lambda (k, v): (k, (2, v)))
-    return vs.union(ws).groupByKey(numSplits).flatMapValues(dispatch)
+    return vs.union(ws).groupByKey(numPartitions).flatMapValues(dispatch)
 
 
-def python_join(rdd, other, numSplits):
+def python_join(rdd, other, numPartitions):
     def dispatch(seq):
         vbuf, wbuf = [], []
         for (n, v) in seq:
@@ -47,10 +47,10 @@ def python_join(rdd, other, numSplits):
             elif n == 2:
                 wbuf.append(v)
         return [(v, w) for v in vbuf for w in wbuf]
-    return _do_python_join(rdd, other, numSplits, dispatch)
+    return _do_python_join(rdd, other, numPartitions, dispatch)
 
 
-def python_right_outer_join(rdd, other, numSplits):
+def python_right_outer_join(rdd, other, numPartitions):
     def dispatch(seq):
         vbuf, wbuf = [], []
         for (n, v) in seq:
@@ -61,10 +61,10 @@ def python_right_outer_join(rdd, other, numSplits):
         if not vbuf:
             vbuf.append(None)
         return [(v, w) for v in vbuf for w in wbuf]
-    return _do_python_join(rdd, other, numSplits, dispatch)
+    return _do_python_join(rdd, other, numPartitions, dispatch)
 
 
-def python_left_outer_join(rdd, other, numSplits):
+def python_left_outer_join(rdd, other, numPartitions):
     def dispatch(seq):
         vbuf, wbuf = [], []
         for (n, v) in seq:
@@ -75,10 +75,10 @@ def python_left_outer_join(rdd, other, numSplits):
         if not wbuf:
             wbuf.append(None)
         return [(v, w) for v in vbuf for w in wbuf]
-    return _do_python_join(rdd, other, numSplits, dispatch)
+    return _do_python_join(rdd, other, numPartitions, dispatch)
 
 
-def python_cogroup(rdd, other, numSplits):
+def python_cogroup(rdd, other, numPartitions):
     vs = rdd.map(lambda (k, v): (k, (1, v)))
     ws = other.map(lambda (k, v): (k, (2, v)))
     def dispatch(seq):
@@ -89,4 +89,4 @@ def python_cogroup(rdd, other, numSplits):
             elif n == 2:
                 wbuf.append(v)
         return (vbuf, wbuf)
-    return vs.union(ws).groupByKey(numSplits).mapValues(dispatch)
+    return vs.union(ws).groupByKey(numPartitions).mapValues(dispatch)
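These helpers back RDD.join, leftOuterJoin, rightOuterJoin, and cogroup; the commit only renames the argument that is threaded through to groupByKey, while the tag-and-dispatch logic is unchanged. A minimal pure-Python sketch of that pattern (a hypothetical standalone illustration, not part of the commit; no SparkContext required):

from collections import defaultdict

def local_join(left, right):
    # Tag each side (1 = left, 2 = right), group by key, then dispatch,
    # mirroring _do_python_join's union + groupByKey + flatMapValues.
    tagged = [(k, (1, v)) for (k, v) in left] + [(k, (2, w)) for (k, w) in right]
    groups = defaultdict(list)
    for k, tv in tagged:
        groups[k].append(tv)
    out = []
    for k, seq in groups.items():
        vbuf = [v for (n, v) in seq if n == 1]
        wbuf = [w for (n, w) in seq if n == 2]
        out.extend((k, (v, w)) for v in vbuf for w in wbuf)
    return out

print(sorted(local_join([("a", 1), ("b", 4)], [("a", 2), ("a", 3)])))
# [('a', (1, 2)), ('a', (1, 3))]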
python/pyspark/rdd.py (+28 −28)

@@ -215,7 +215,7 @@ class RDD(object):
                 yield pair
         return java_cartesian.flatMap(unpack_batches)
 
-    def groupBy(self, f, numSplits=None):
+    def groupBy(self, f, numPartitions=None):
         """
         Return an RDD of grouped items.
 
@@ -224,7 +224,7 @@ class RDD(object):
         >>> sorted([(x, sorted(y)) for (x, y) in result])
         [(0, [2, 8]), (1, [1, 1, 3, 5])]
         """
-        return self.map(lambda x: (f(x), x)).groupByKey(numSplits)
+        return self.map(lambda x: (f(x), x)).groupByKey(numPartitions)
 
     def pipe(self, command, env={}):
         """
@@ -274,7 +274,7 @@ class RDD(object):
 
     def reduce(self, f):
         """
-        Reduces the elements of this RDD using the specified commutative and
+        Reduces the elements of this RDD using the specified commutative and
         associative binary operator.
 
         >>> from operator import add
@@ -422,22 +422,22 @@ class RDD(object):
         """
         return dict(self.collect())
 
-    def reduceByKey(self, func, numSplits=None):
+    def reduceByKey(self, func, numPartitions=None):
         """
         Merge the values for each key using an associative reduce function.
 
         This will also perform the merging locally on each mapper before
         sending results to a reducer, similarly to a "combiner" in MapReduce.
 
-        Output will be hash-partitioned with C{numSplits} splits, or the
-        default parallelism level if C{numSplits} is not specified.
+        Output will be hash-partitioned with C{numPartitions} partitions, or
+        the default parallelism level if C{numPartitions} is not specified.
 
         >>> from operator import add
         >>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
         >>> sorted(rdd.reduceByKey(add).collect())
         [('a', 2), ('b', 1)]
         """
-        return self.combineByKey(lambda x: x, func, func, numSplits)
+        return self.combineByKey(lambda x: x, func, func, numPartitions)
 
     def reduceByKeyLocally(self, func):
         """
@@ -474,7 +474,7 @@ class RDD(object):
         """
         return self.map(lambda x: x[0]).countByValue()
 
-    def join(self, other, numSplits=None):
+    def join(self, other, numPartitions=None):
         """
         Return an RDD containing all pairs of elements with matching keys in
         C{self} and C{other}.
@@ -489,9 +489,9 @@ class RDD(object):
         >>> sorted(x.join(y).collect())
         [('a', (1, 2)), ('a', (1, 3))]
         """
-        return python_join(self, other, numSplits)
+        return python_join(self, other, numPartitions)
 
-    def leftOuterJoin(self, other, numSplits=None):
+    def leftOuterJoin(self, other, numPartitions=None):
         """
         Perform a left outer join of C{self} and C{other}.
 
@@ -506,9 +506,9 @@ class RDD(object):
         >>> sorted(x.leftOuterJoin(y).collect())
         [('a', (1, 2)), ('b', (4, None))]
         """
-        return python_left_outer_join(self, other, numSplits)
+        return python_left_outer_join(self, other, numPartitions)
 
-    def rightOuterJoin(self, other, numSplits=None):
+    def rightOuterJoin(self, other, numPartitions=None):
         """
         Perform a right outer join of C{self} and C{other}.
 
@@ -523,10 +523,10 @@ class RDD(object):
         >>> sorted(y.rightOuterJoin(x).collect())
         [('a', (2, 1)), ('b', (None, 4))]
         """
-        return python_right_outer_join(self, other, numSplits)
+        return python_right_outer_join(self, other, numPartitions)
 
     # TODO: add option to control map-side combining
-    def partitionBy(self, numSplits, partitionFunc=hash):
+    def partitionBy(self, numPartitions, partitionFunc=hash):
         """
         Return a copy of the RDD partitioned using the specified partitioner.
 
@@ -535,22 +535,22 @@ class RDD(object):
         >>> set(sets[0]).intersection(set(sets[1]))
         set([])
         """
-        if numSplits is None:
-            numSplits = self.ctx.defaultParallelism
+        if numPartitions is None:
+            numPartitions = self.ctx.defaultParallelism
         # Transferring O(n) objects to Java is too expensive.  Instead, we'll
-        # form the hash buckets in Python, transferring O(numSplits) objects
+        # form the hash buckets in Python, transferring O(numPartitions) objects
         # to Java.  Each object is a (splitNumber, [objects]) pair.
         def add_shuffle_key(split, iterator):
             buckets = defaultdict(list)
             for (k, v) in iterator:
-                buckets[partitionFunc(k) % numSplits].append((k, v))
+                buckets[partitionFunc(k) % numPartitions].append((k, v))
             for (split, items) in buckets.iteritems():
                 yield str(split)
                 yield dump_pickle(Batch(items))
         keyed = PipelinedRDD(self, add_shuffle_key)
         keyed._bypass_serializer = True
         pairRDD = self.ctx._jvm.PairwiseRDD(keyed._jrdd.rdd()).asJavaPairRDD()
-        partitioner = self.ctx._jvm.PythonPartitioner(numSplits,
+        partitioner = self.ctx._jvm.PythonPartitioner(numPartitions,
                                                       id(partitionFunc))
         jrdd = pairRDD.partitionBy(partitioner).values()
         rdd = RDD(jrdd, self.ctx)
@@ -561,7 +561,7 @@ class RDD(object):
 
     # TODO: add control over map-side aggregation
     def combineByKey(self, createCombiner, mergeValue, mergeCombiners,
-                     numSplits=None):
+                     numPartitions=None):
         """
         Generic function to combine the elements for each key using a custom
         set of aggregation functions.
@@ -586,8 +586,8 @@ class RDD(object):
         >>> sorted(x.combineByKey(str, add, add).collect())
         [('a', '11'), ('b', '1')]
         """
-        if numSplits is None:
-            numSplits = self.ctx.defaultParallelism
+        if numPartitions is None:
+            numPartitions = self.ctx.defaultParallelism
         def combineLocally(iterator):
             combiners = {}
             for (k, v) in iterator:
@@ -597,7 +597,7 @@ class RDD(object):
                     combiners[k] = mergeValue(combiners[k], v)
             return combiners.iteritems()
         locally_combined = self.mapPartitions(combineLocally)
-        shuffled = locally_combined.partitionBy(numSplits)
+        shuffled = locally_combined.partitionBy(numPartitions)
         def _mergeCombiners(iterator):
             combiners = {}
             for (k, v) in iterator:
@@ -609,10 +609,10 @@ class RDD(object):
         return shuffled.mapPartitions(_mergeCombiners)
 
     # TODO: support variant with custom partitioner
-    def groupByKey(self, numSplits=None):
+    def groupByKey(self, numPartitions=None):
         """
         Group the values for each key in the RDD into a single sequence.
-        Hash-partitions the resulting RDD with into numSplits partitions.
+        Hash-partitions the resulting RDD with into numPartitions partitions.
 
         >>> x = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
         >>> sorted(x.groupByKey().collect())
@@ -630,7 +630,7 @@ class RDD(object):
             return a + b
         return self.combineByKey(createCombiner, mergeValue, mergeCombiners,
-                                 numSplits)
+                                 numPartitions)
 
     # TODO: add tests
     def flatMapValues(self, f):
@@ -659,7 +659,7 @@ class RDD(object):
         return self.cogroup(other)
 
     # TODO: add variant with custom parittioner
-    def cogroup(self, other, numSplits=None):
+    def cogroup(self, other, numPartitions=None):
         """
         For each key k in C{self} or C{other}, return a resulting RDD that
         contains a tuple with the list of values for that key in C{self} as well
@@ -670,7 +670,7 @@ class RDD(object):
         >>> sorted(x.cogroup(y).collect())
         [('a', ([1], [2])), ('b', ([4], []))]
         """
-        return python_cogroup(self, other, numSplits)
+        return python_cogroup(self, other, numPartitions)
 
     # TODO: `lookup` is disabled because we can't make direct comparisons based
     # on the key; we need to compare the hash of the key to the hash of the
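On the public API side, the rename means callers pass the partition count as numPartitions rather than numSplits. A short sketch of the new keyword in use, assuming an existing SparkContext named sc as in the doctests above:

from operator import add

# Assumes `sc` is an existing SparkContext, as in the doctests above.
pairs = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])

# After this commit the keyword is `numPartitions` (formerly `numSplits`).
counts = pairs.reduceByKey(add, numPartitions=2)
print(sorted(counts.collect()))          # [('a', 2), ('b', 1)]

groups = pairs.groupByKey(numPartitions=2)
joined = pairs.join(counts, numPartitions=2)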