doulin4844 2019-03-15 00:58
浏览 9

使用Go SDK的管道在Cloud Dataflow中不是完全并行的

I have Apache Beam code implementation on Go SDK as described below. The pipeline has 3 steps. One is textio.Read, other one is CountLines and the last step is ProcessLines. ProcessLines step takes around 10 seconds time.

After I implemented the below code, I found that during pipeline execution, each step waits for completion of execution of the previous step and starts working in parallel after previous step is completed. Each step itself is running in parallel but only one step is running at a time within the pipeline, where the pipeline is not fully parallel right now.

This question is follow-up of the below question. Although the answer partially solves the problem, the pipeline currently does not run in fully parallel. Parallelism Problem on Cloud Dataflow using Go SDK

package main

import (
    "context"
    "flag"
    "time"

    "github.com/apache/beam/sdks/go/pkg/beam"
    "github.com/apache/beam/sdks/go/pkg/beam/io/textio"
    "github.com/apache/beam/sdks/go/pkg/beam/log"
    "github.com/apache/beam/sdks/go/pkg/beam/x/beamx"
)

// metrics to be monitored
var (
    input         = flag.String("input", "", "Input file (required).")
    numberOfLines = beam.NewCounter("extract", "numberOfLines")
    lineLen       = beam.NewDistribution("extract", "lineLenDistro")
)

func AddRandomKey(s beam.Scope, col beam.PCollection) beam.PCollection {
    return beam.ParDo(s, addRandomKeyFn, col)
}

func addRandomKeyFn(elm beam.T) (int, beam.T) {
    return rand.Int(), elm
}

func countLines(ctx context.Context, _ int, lines func(*string) bool, emit func(string)) {
    var line string
    for lines(&line) {
        lineLen.Update(ctx, int64(len(line)))
        numberOfLines.Inc(ctx, 1)
        emit(line)
    }
}
func processLines(ctx context.Context, _ int, lines func(*string) bool) {
    var line string
    for lines(&line) {
        time.Sleep(10 * time.Second)
        numberOfLinesProcess.Inc(ctx, 1)
    }
}

func CountLines(s beam.Scope, lines beam.PCollection) beam.PCollection {
    s = s.Scope("Count Lines")
    keyed := AddRandomKey(s, lines)
    grouped := beam.GroupByKey(s, keyed)

    return beam.ParDo(s, countLines, grouped)
}

func ProcessLines(s beam.Scope, lines beam.PCollection) {
    s = s.Scope("Process Lines")
    keyed := AddRandomKey(s, lines)
    grouped := beam.GroupByKey(s, keyed)

    beam.ParDo0(s, processLines, grouped)
}

func main() {
    // If beamx or Go flags are used, flags must be parsed first.
    flag.Parse()
    // beam.Init() is an initialization hook that must be called on startup. On
    // distributed runners, it is used to intercept control.
    beam.Init()

    // Input validation is done as usual. Note that it must be after Init().
    if *input == "" {
        log.Fatal(context.Background(), "No input file provided")
    }

    p := beam.NewPipeline()
    s := p.Root()

    l := textio.Read(s, *input)
    lines := CountLines(s, l)
    ProcessLines(s, lines)

    // Concept #1: The beamx.Run convenience wrapper allows a number of
    // pre-defined runners to be used via the --runner flag.
    if err := beamx.Run(context.Background(), p); err != nil {
        log.Fatalf(context.Background(), "Failed to execute job: %v", err.Error())
    }
}
  • 写回答

0条回答 默认 最新

    报告相同问题?

    悬赏问题

    • ¥500 火焰左右视图、视差(基于双目相机)
    • ¥100 set_link_state
    • ¥15 虚幻5 UE美术毛发渲染
    • ¥15 CVRP 图论 物流运输优化
    • ¥15 Tableau online 嵌入ppt失败
    • ¥100 支付宝网页转账系统不识别账号
    • ¥15 基于单片机的靶位控制系统
    • ¥15 真我手机蓝牙传输进度消息被关闭了,怎么打开?(关键词-消息通知)
    • ¥15 装 pytorch 的时候出了好多问题,遇到这种情况怎么处理?
    • ¥20 IOS游览器某宝手机网页版自动立即购买JavaScript脚本