how to achive exactly once with flink sql when consume from kafka and then write to mysql

classic Classic list List threaded Threaded
1 message Options
Reply | Threaded
Open this post in threaded view
|

how to achive exactly once with flink sql when consume from kafka and then write to mysql

huliancheng
we are going to build our data computing system based on flink sql.
for now, with flink 1.11.0, we had achived a milestone: consuming from
kafka, then select from dynamic table, and write results to mysql.

but, when we test the exactly once(end to end), we found problem.

below are our sourcecode, shell and sql files:


sql file:
-- source, 使用计算列,uuid()在线生成uuid
CREATE TABLE user_log (
    user_id VARCHAR,
    item_id VARCHAR,
    category_id VARCHAR,
    behavior VARCHAR,
    ts TIMESTAMP(3),
    uuid as uuid()
) WITH (
    'connector.type' = 'kafka',
    'connector.version' = 'universal',
    'connector.topic' = 'user_behavior',
    'connector.startup-mode' = 'earliest-offset',
    'connector.properties.0.key' = 'zookeeper.connect',
    'connector.properties.0.value' = 'localhost:2181',
    'connector.properties.1.key' = 'bootstrap.servers',
    'connector.properties.1.value' = 'localhost:9092',
    'connector.properties.2.key' = 'group.id',
    'connector.properties.2.value' = 'test-consumer-group12',
    'update-mode' = 'append',
    'format.type' = 'json',
    'format.derive-schema' = 'true'
);

-- sink
CREATE TABLE pvuv_sink (
    uuid varchar,
    dt VARCHAR,
    pv BIGINT,
    uv BIGINT
) WITH (
    'connector.type' = 'jdbc',
    'connector.url' = 'jdbc:mysql://localhost:3306/flink_test',
    'connector.table' = 'pvuv_sink13',
    'connector.username' = 'root',
    'connector.password' = '123456',
    'connector.write.flush.max-rows' = '1',
    'connector.sink.semantic' = 'exactly-once'
);


INSERT INTO pvuv_sink
SELECT
  uuid,
  DATE_FORMAT(ts, 'yyyy-MM-dd HH:00') dt,
  COUNT(*) AS pv,
  COUNT(DISTINCT user_id) AS uv
FROM user_log
GROUP BY DATE_FORMAT(ts, 'yyyy-MM-dd HH:00'), uuid;


sql parse and concat file:
/**
 * 这是进行命令解析和提交的程序,整个工程入口
 */
public class SqlSubmit {

    public static void main(String[] args) throws Exception {
        // 解析命令行参数
        final CliOptions options = CliOptionsParser.parseClient(args);

        // 将解析好的命令行参数传递给SqlSubmit
        SqlSubmit submit = new SqlSubmit(options);

        // 运行程序
        submit.run();
    }

    //
--------------------------------------------------------------------------------------------
    private String sqlFilePath;
    private TableEnvironment tEnv;

    // 获取到sql执行文件的路径
    private SqlSubmit(CliOptions options) {
        this.sqlFilePath = options.getSqlFilePath();
    }

    private void run() throws Exception {
        // 创建flink执行的上下文对象
        StreamExecutionEnvironment environment =
StreamExecutionEnvironment.getExecutionEnvironment();
        this.tEnv = StreamTableEnvironment.create(environment,
               
EnvironmentSettings.newInstance().inStreamingMode().useBlinkPlanner().build());

        // 获取所有的sql文件行内容,转为字符串list
        List<String> sql = Files.readAllLines(Paths.get(sqlFilePath));
        List<SqlCommandParser.SqlCommandCall> calls =
SqlCommandParser.parse(sql);
        if (calls.size() == 0) {
            //no sql to execute
            throw new RuntimeException("There is no sql statement to
execute,please check your sql file: " + sqlFilePath);
        }
        for (SqlCommandParser.SqlCommandCall call : calls) {
//            System.out.println(call.command.toString());
            callCommand(call);
        }
    }

    //
--------------------------------------------------------------------------------------------

    private void callCommand(SqlCommandParser.SqlCommandCall cmdCall) {
        switch (cmdCall.command) {
            case SET:
                callSet(cmdCall);
                break;
            case CREATE_TABLE:
                callCreateTable(cmdCall);
                break;
            case INSERT_INTO:
                callInsertInto(cmdCall);
                break;
            default:
                throw new RuntimeException("Unsupported command: " +
cmdCall.command);
        }
    }

    private void callSet(SqlCommandParser.SqlCommandCall cmdCall) {
        String key = cmdCall.operands[0];
        String value = cmdCall.operands[1];
        tEnv.getConfig().getConfiguration().setString(key, value);
        System.out.println("设置 " + key + "-->" + value + " 成功");
    }

    private void callCreateTable(SqlCommandParser.SqlCommandCall cmdCall) {
        String ddl = cmdCall.operands[0];
        try {
            tEnv.executeSql(ddl);
        } catch (SqlParserException e) {
            throw new RuntimeException("SQL parse failed:\n" + ddl + "\n",
e);
        }
        String tableName = ddl.split("\\s+")[2];
        System.out.println("创建表 " + tableName + " 成功");
    }

    private void callInsertInto(SqlCommandParser.SqlCommandCall cmdCall) {
        String dml = cmdCall.operands[0];
        Optional<JobClient> jobClient;
        try {
            TableResult result = tEnv.executeSql(dml);
            jobClient = result.getJobClient();
        } catch (SqlParserException e) {
            throw new RuntimeException("SQL parse failed:\n" + dml + "\n",
e);
        }

        if (jobClient.isPresent()) {
            JobID jobID = jobClient.get().getJobID();
            System.out.println("任务提交成功,JobId: " + jobID);
        }
    }
}


shell to submit a job:
#!/bin/bash
export FLINK_HOME=/Users/hulc/developEnv/flink-1.11.0

sql_file=$2
# flink home检查
if [ -z "$FLINK_HOME" ];then
        echo "请指定FLINK_HOME 或者在该配置文件中配置"
        exit 1
fi

# 参数数量检查
if [ $# -lt 2 ];then
        echo "命令格式为 ./sql-submit.sh -f <sql-file>"
        exit 1
fi

# 要依赖的jar包,这里名字是写死的,后去可以使用传入参数
# SQL_JAR=./flink-sql-submit-1.0-SNAPSHOT.jar
SQL_JAR=./target/flink-test1-1.0-SNAPSHOT.jar
# 检查是否正确加载这个jar包
if [ -f $SQL_JAR ];then
        echo "`date +%Y-%m-%d" "%H:%M:%S` load jars from ${SQL_JAR}"
else
        echo "failed to load dependent jars for sql-submit.sh,please specify it"
        exit 1
fi

# 检查是否制定sql文件
if [ ! -f $sql_file ];then
        echo "sql文件 $sql_file 不存在,请检查文件路径"
        exit 1
fi

#提交命令, 注意这里的提交参数也是写死的,并行度5  main主类全名, 工程打出的jar包
# $1 就是 -f,也就是制定需要执行的文件参数
# $sql_file 就是制定需要执行的sql文件
if [ $1 = "-f" ];then
        $FLINK_HOME/bin/flink run -p 1 -c SqlSubmit
/Users/hulc/develop/flink-test1/target/flink-test1-1.0-SNAPSHOT.jar $1
$sql_file
else
        echo "命令格式为 ./sql-submit.sh -f <sql-file>"
        exit 1
fi





--
Sent from: http://apache-flink.147419.n8.nabble.com/