Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
spark
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
cs525-sp18-g07
spark
Commits
ca37e7b3
Commit
ca37e7b3
authored
14 years ago
by
Mosharaf Chowdhury
Browse files
Options
Downloads
Patches
Plain Diff
Renamed CustomParallelLocalFileShuffle
parent
c6df327d
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
conf/java-opts
+1
-1
1 addition, 1 deletion
conf/java-opts
src/scala/spark/CustomParallelLocalFileShuffle.scala
+17
-15
17 additions, 15 deletions
src/scala/spark/CustomParallelLocalFileShuffle.scala
with
18 additions
and
16 deletions
conf/java-opts
+
1
−
1
View file @
ca37e7b3
-Dspark.shuffle.class=spark.LocalFileShuffle
-Dspark.shuffle.UseHttpPipelining=true
-Dspark.shuffle.class=spark.
CustomParallel
LocalFileShuffle
This diff is collapsed.
Click to expand it.
src/scala/spark/LocalFileShuffle.scala
→
src/scala/spark/
CustomParallel
LocalFileShuffle.scala
+
17
−
15
View file @
ca37e7b3
...
...
@@ -9,12 +9,14 @@ import java.util.concurrent.{Executors, ThreadPoolExecutor, ThreadFactory}
import
scala.collection.mutable.
{
ArrayBuffer
,
HashMap
}
/**
* A simple implementation of shuffle using local files served through HTTP.
* An implementation of shuffle using local files served through custom server
* where receivers create simultaneous connections to multiple servers by
* setting the 'spark.parallelLocalFileShuffle.maxConnections' config option.
*
* TODO: Add support for compression when spark.compress is set to true.
*/
@serializable
class
LocalFileShuffle
[
K
,
V
,
C
]
extends
Shuffle
[
K
,
V
,
C
]
with
Logging
{
class
CustomParallel
LocalFileShuffle
[
K
,
V
,
C
]
extends
Shuffle
[
K
,
V
,
C
]
with
Logging
{
@transient
var
totalSplits
=
0
@transient
var
hasSplits
=
0
@transient
var
hasSplitsBitVector
:
BitSet
=
null
...
...
@@ -30,7 +32,7 @@ class LocalFileShuffle[K, V, C] extends Shuffle[K, V, C] with Logging {
:
RDD
[(
K
,
C
)]
=
{
val
sc
=
input
.
sparkContext
val
shuffleId
=
LocalFileShuffle
.
newShuffleId
()
val
shuffleId
=
CustomParallel
LocalFileShuffle
.
newShuffleId
()
logInfo
(
"Shuffle ID: "
+
shuffleId
)
val
splitRdd
=
new
NumberedSplitRDD
(
input
)
...
...
@@ -55,7 +57,7 @@ class LocalFileShuffle[K, V, C] extends Shuffle[K, V, C] with Logging {
}
for
(
i
<-
0
until
numOutputSplits
)
{
val
file
=
LocalFileShuffle
.
getOutputFile
(
shuffleId
,
myIndex
,
i
)
val
file
=
CustomParallel
LocalFileShuffle
.
getOutputFile
(
shuffleId
,
myIndex
,
i
)
val
writeStartTime
=
System
.
currentTimeMillis
logInfo
(
"BEGIN WRITE: "
+
file
)
val
out
=
new
ObjectOutputStream
(
new
FileOutputStream
(
file
))
...
...
@@ -65,7 +67,7 @@ class LocalFileShuffle[K, V, C] extends Shuffle[K, V, C] with Logging {
val
writeTime
=
(
System
.
currentTimeMillis
-
writeStartTime
)
logInfo
(
"Writing "
+
file
+
" of size "
+
file
.
length
+
" bytes took "
+
writeTime
+
" millis."
)
}
(
myIndex
,
LocalFileShuffle
.
serverAddress
,
LocalFileShuffle
.
serverPort
)
(
myIndex
,
CustomParallel
LocalFileShuffle
.
serverAddress
,
CustomParallel
LocalFileShuffle
.
serverPort
)
}).
collect
()
val
splitsByUri
=
new
ArrayBuffer
[(
String
,
Int
,
Int
)]
...
...
@@ -85,11 +87,11 @@ class LocalFileShuffle[K, V, C] extends Shuffle[K, V, C] with Logging {
combiners
=
new
HashMap
[
K
,
C
]
var
threadPool
=
LocalFileShuffle
.
newDaemonFixedThreadPool
(
LocalFileShuffle
.
MaxConnections
)
CustomParallel
LocalFileShuffle
.
newDaemonFixedThreadPool
(
CustomParallel
LocalFileShuffle
.
MaxConnections
)
while
(
hasSplits
<
totalSplits
)
{
var
numThreadsToCreate
=
Math
.
min
(
totalSplits
,
LocalFileShuffle
.
MaxConnections
)
-
Math
.
min
(
totalSplits
,
CustomParallel
LocalFileShuffle
.
MaxConnections
)
-
threadPool
.
getActiveCount
while
(
hasSplits
<
totalSplits
&&
numThreadsToCreate
>
0
)
{
...
...
@@ -113,7 +115,7 @@ class LocalFileShuffle[K, V, C] extends Shuffle[K, V, C] with Logging {
}
// Sleep for a while before creating new threads
Thread
.
sleep
(
LocalFileShuffle
.
MinKnockInterval
)
Thread
.
sleep
(
CustomParallel
LocalFileShuffle
.
MinKnockInterval
)
}
threadPool
.
shutdown
...
...
@@ -133,7 +135,7 @@ class LocalFileShuffle[K, V, C] extends Shuffle[K, V, C] with Logging {
}
if
(
requiredSplits
.
size
>
0
)
{
requiredSplits
(
LocalFileShuffle
.
ranGen
.
nextInt
(
requiredSplits
.
size
))
requiredSplits
(
CustomParallel
LocalFileShuffle
.
ranGen
.
nextInt
(
requiredSplits
.
size
))
}
else
{
-
1
}
...
...
@@ -160,7 +162,7 @@ class LocalFileShuffle[K, V, C] extends Shuffle[K, V, C] with Logging {
}
var
timeOutTimer
=
new
Timer
timeOutTimer
.
schedule
(
timeOutTask
,
LocalFileShuffle
.
MaxKnockInterval
)
timeOutTimer
.
schedule
(
timeOutTask
,
CustomParallel
LocalFileShuffle
.
MaxKnockInterval
)
logInfo
(
"ShuffleClient started... => %s:%d#%s"
.
format
(
hostAddress
,
listenPort
,
requestPath
))
...
...
@@ -254,7 +256,7 @@ class LocalFileShuffle[K, V, C] extends Shuffle[K, V, C] with Logging {
}
}
object
LocalFileShuffle
extends
Logging
{
object
CustomParallel
LocalFileShuffle
extends
Logging
{
// Used throughout the code for small and large waits/timeouts
private
var
MinKnockInterval_
=
1000
private
var
MaxKnockInterval_
=
5000
...
...
@@ -279,12 +281,12 @@ object LocalFileShuffle extends Logging {
if
(!
initialized
)
{
// Load config parameters
MinKnockInterval_
=
System
.
getProperty
(
"spark.
s
huffle.MinKnockInterval"
,
"1000"
).
toInt
System
.
getProperty
(
"spark.
parallelLocalFileS
huffle.MinKnockInterval"
,
"1000"
).
toInt
MaxKnockInterval_
=
System
.
getProperty
(
"spark.
s
huffle.MaxKnockInterval"
,
"5000"
).
toInt
System
.
getProperty
(
"spark.
parallelLocalFileS
huffle.MaxKnockInterval"
,
"5000"
).
toInt
MaxConnections_
=
System
.
getProperty
(
"spark.
s
huffle.MaxConnections"
,
"4"
).
toInt
System
.
getProperty
(
"spark.
parallelLocalFileS
huffle.MaxConnections"
,
"4"
).
toInt
// TODO: localDir should be created by some mechanism common to Spark
// so that it can be shared among shuffle, broadcast, etc
...
...
@@ -366,7 +368,7 @@ object LocalFileShuffle extends Logging {
class
ShuffleServer
extends
Thread
with
Logging
{
var
threadPool
=
newDaemonFixedThreadPool
(
LocalFileShuffle
.
MaxConnections
)
var
threadPool
=
newDaemonFixedThreadPool
(
CustomParallel
LocalFileShuffle
.
MaxConnections
)
var
serverSocket
:
ServerSocket
=
null
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment