flink on yarn配置问题

classic Classic list List threaded Threaded
9 messages Options
Reply | Threaded
Open this post in threaded view
|

flink on yarn配置问题

nobleyd
最近想试一下flink on yarn。yarn是公司之前就有的,但之前只运行过spark,现在想试一下flink。
但是遇到了不少报错,目前的情况如下。

23:09:11.181 [main] ERROR com.xxx.Application - Main Method catched
exception: {}
org.apache.flink.client.deployment.ClusterDeploymentException: Could not
deploy Yarn job cluster.
        at
org.apache.flink.yarn.YarnClusterDescriptor.deployJobCluster(YarnClusterDescriptor.java:397)
        at
org.apache.flink.client.deployment.executors.AbstractJobClusterExecutor.execute(AbstractJobClusterExecutor.java:70)
        at
org.apache.flink.streaming.api.environment.StreamExecutionEnvironment.executeAsync(StreamExecutionEnvironment.java:1733)
        at
org.apache.flink.streaming.api.environment.StreamContextEnvironment.executeAsync(StreamContextEnvironment.java:94)
        at
org.apache.flink.streaming.api.environment.StreamContextEnvironment.execute(StreamContextEnvironment.java:63)
        at
org.apache.flink.streaming.api.environment.StreamExecutionEnvironment.execute(StreamExecutionEnvironment.java:1620)
        at com.xxx.Application.main(Application.java:116)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
        at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:498)
        at
org.apache.flink.client.program.PackagedProgram.callMainMethod(PackagedProgram.java:321)
        at
org.apache.flink.client.program.PackagedProgram.invokeInteractiveModeForExecution(PackagedProgram.java:205)
        at
org.apache.flink.client.ClientUtils.executeProgram(ClientUtils.java:138)
        at
org.apache.flink.client.cli.CliFrontend.executeProgram(CliFrontend.java:664)
        at org.apache.flink.client.cli.CliFrontend.run(CliFrontend.java:213)
        at
org.apache.flink.client.cli.CliFrontend.parseParameters(CliFrontend.java:895)
        at
org.apache.flink.client.cli.CliFrontend.lambda$main$10(CliFrontend.java:968)
        at java.security.AccessController.doPrivileged(Native Method)
        at javax.security.auth.Subject.doAs(Subject.java:422)
        at
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1754)
        at
org.apache.flink.runtime.security.HadoopSecurityContext.runSecured(HadoopSecurityContext.java:41)
        at
org.apache.flink.client.cli.CliFrontend.main(CliFrontend.java:968)
Caused by: org.apache.flink.configuration.IllegalConfigurationException:
The number of requested virtual cores for application master 1 exceeds the
maximum number of virtual cores 0 available in the Yarn Cluster.
        at
org.apache.flink.yarn.YarnClusterDescriptor.isReadyForDeployment(YarnClusterDescriptor.java:283)
        at
org.apache.flink.yarn.YarnClusterDescriptor.deployInternal(YarnClusterDescriptor.java:444)
        at
org.apache.flink.yarn.YarnClusterDescriptor.deployJobCluster(YarnClusterDescriptor.java:390)
        ... 22 common frames omitted
org.apache.flink.client.deployment.ClusterDeploymentException: Could not
deploy Yarn job cluster.
        at
org.apache.flink.yarn.YarnClusterDescriptor.deployJobCluster(YarnClusterDescriptor.java:397)
        at
org.apache.flink.client.deployment.executors.AbstractJobClusterExecutor.execute(AbstractJobClusterExecutor.java:70)
        at
org.apache.flink.streaming.api.environment.StreamExecutionEnvironment.executeAsync(StreamExecutionEnvironment.java:1733)
        at
org.apache.flink.streaming.api.environment.StreamContextEnvironment.executeAsync(StreamContextEnvironment.java:94)
        at
org.apache.flink.streaming.api.environment.StreamContextEnvironment.execute(StreamContextEnvironment.java:63)
        at
org.apache.flink.streaming.api.environment.StreamExecutionEnvironment.execute(StreamExecutionEnvironment.java:1620)
        at com.xxx.Application.main(Application.java:116)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
        at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:498)
        at
org.apache.flink.client.program.PackagedProgram.callMainMethod(PackagedProgram.java:321)
        at
org.apache.flink.client.program.PackagedProgram.invokeInteractiveModeForExecution(PackagedProgram.java:205)
        at
org.apache.flink.client.ClientUtils.executeProgram(ClientUtils.java:138)
        at
org.apache.flink.client.cli.CliFrontend.executeProgram(CliFrontend.java:664)
        at org.apache.flink.client.cli.CliFrontend.run(CliFrontend.java:213)
        at
org.apache.flink.client.cli.CliFrontend.parseParameters(CliFrontend.java:895)
        at
org.apache.flink.client.cli.CliFrontend.lambda$main$10(CliFrontend.java:968)
        at java.security.AccessController.doPrivileged(Native Method)
        at javax.security.auth.Subject.doAs(Subject.java:422)
        at
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1754)
        at
org.apache.flink.runtime.security.HadoopSecurityContext.runSecured(HadoopSecurityContext.java:41)
        at
org.apache.flink.client.cli.CliFrontend.main(CliFrontend.java:968)
Caused by: org.apache.flink.configuration.IllegalConfigurationException:
The number of requested virtual cores for application master 1 exceeds the
maximum number of virtual cores 0 available in the Yarn Cluster.
        at
org.apache.flink.yarn.YarnClusterDescriptor.isReadyForDeployment(YarnClusterDescriptor.java:283)
        at
org.apache.flink.yarn.YarnClusterDescriptor.deployInternal(YarnClusterDescriptor.java:444)
        at
org.apache.flink.yarn.YarnClusterDescriptor.deployJobCluster(YarnClusterDescriptor.java:390)
        ... 22 more

如上,猜测可能是配置问题,但不清楚具体原因,也可能和权限有关系。有没有人知道报错原因,或者能提供一个定位方法?另外,日志中“... 22 more”这种被省略的内容能进一步看到吗?
Reply | Threaded
Open this post in threaded view
|

Re: flink on yarn配置问题

Zou Dan
Hi, 一旦, root cause 应该是下面这个日志
The number of requested virtual cores for application master 1 exceeds the
maximum number of virtual cores 0 available in the Yarn Cluster.

我简单看了一下代码,应该是你们 yarn 节点上没有可用的资源,numYarnMaxVcores = 0

> 2020年8月21日 下午11:11,赵一旦 <[hidden email] <mailto:[hidden email]>> 写道:
>
> The number of requested virtual cores for application master 1 exceeds the
> maximum number of virtual cores 0 available in the Yarn Cluster.

Reply | Threaded
Open this post in threaded view
|

Re: flink on yarn配置问题

nobleyd
嗯,直观看是这个问题。想知道这个问题有啥常见原因?这个报错只是最终原因,但不一定是直接原因。因为这个yarn集群不可能没资源,我只是简单实验下,我们的yarn是个超级集群,不可能没资源。
我猜测会不会是其他问题,比如yarn队列不对,导致没资源?再或者不清楚可不可能与yarn的鉴权有关,我们的yarn集群应该是有用户权限和资源配额限制的,但理论上我是从另外一个集群上抄的配置,不清楚有没有搞错。
原机器是用于提交spark任务的,我主要复制了hadoop部分(yarn)到另一个机器(B),用B这台机器计划做flink任务的提交。

Zou Dan <[hidden email]> 于2020年8月23日周日 下午2:16写道:

> Hi, 一旦, root cause 应该是下面这个日志
> The number of requested virtual cores for application master 1 exceeds the
> maximum number of virtual cores 0 available in the Yarn Cluster.
>
> 我简单看了一下代码,应该是你们 yarn 节点上没有可用的资源,numYarnMaxVcores = 0
>
> > 2020年8月21日 下午11:11,赵一旦 <[hidden email] <mailto:[hidden email]>>
> 写道:
> >
> > The number of requested virtual cores for application master 1 exceeds
> the
> > maximum number of virtual cores 0 available in the Yarn Cluster.
>
>
Reply | Threaded
Open this post in threaded view
|

Re: flink on yarn配置问题

魏烽
In reply to this post by nobleyd
Hi

一旦,提交任务的命令有吗?可以发出来看看。

或者在提交的时候指定一下提交任务到哪个队列

 原始邮件
发件人: 赵一旦<[hidden email]>
收件人: user-zh<[hidden email]>
发送时间: 2020年8月23日(周日) 22:58
主题: Re: flink on yarn配置问题


嗯,直观看是这个问题。想知道这个问题有啥常见原因?这个报错只是最终原因,但不一定是直接原因。因为这个yarn集群不可能没资源,我只是简单实验下,我们的yarn是个超级集群,不可能没资源。
我猜测会不会是其他问题,比如yarn队列不对,导致没资源?再或者不清楚可不可能与yarn的鉴权有关,我们的yarn集群应该是有用户权限和资源配额限制的,但理论上我是从另外一个集群上抄的配置,不清楚有没有搞错。
原机器是用于提交spark任务的,我主要复制了hadoop部分(yarn)到另一个机器(B),用B这台机器计划做flink任务的提交。

Zou Dan <[hidden email]<mailto:[hidden email]>> 于2020年8月23日周日 下午2:16写道:

> Hi, 一旦, root cause 应该是下面这个日志
> The number of requested virtual cores for application master 1 exceeds the
> maximum number of virtual cores 0 available in the Yarn Cluster.
>
> 我简单看了一下代码,应该是你们 yarn 节点上没有可用的资源,numYarnMaxVcores = 0
>
> > 2020年8月21日 下午11:11,赵一旦 <[hidden email]<mailto:[hidden email]> <mailto:[hidden email]<mailto:[hidden email]>>>
> 写道:
> >
> > The number of requested virtual cores for application master 1 exceeds
> the
> > maximum number of virtual cores 0 available in the Yarn Cluster.
>
>

Reply | Threaded
Open this post in threaded view
|

Re: flink on yarn配置问题

caozhen
In reply to this post by nobleyd

报错是申请AM时vcore不够

1、可以确认下当前队列是否有剩余vcore数
2、确认当前队列允许的最大应用数是否超了


之前遇到过这个问题原因是队列没有分配资源,跟你的可能不一样




--
Sent from: http://apache-flink.147419.n8.nabble.com/
Reply | Threaded
Open this post in threaded view
|

Re: flink on yarn配置问题

caozhen
This post was updated on .
In reply to this post by nobleyd
CONTENTS DELETED
The author has deleted this message.
Reply | Threaded
Open this post in threaded view
|

Re: flink on yarn配置问题

nobleyd
比如今天尝试了一波命令:./bin/yarn-session.sh -nm test_flink -q -qu upd_security -s 1
-tm 3024MB -jm 3024MB
同时我设置了 export HADOOP_USER_NAME=xxx
,这个在启动的时候会看到日志:org.apache.flink.runtime.security.modules.HadoopModule  -
Hadoop user set to upd_security (auth:SIMPLE)。

然后报错:

2020-08-24 10:52:31 ERROR org.apache.flink.yarn.cli.FlinkYarnSessionCli  -
Error while running the Flink session.
java.lang.RuntimeException: Couldn't get cluster description
        at
org.apache.flink.yarn.YarnClusterDescriptor.getClusterDescription(YarnClusterDescriptor.java:1254)
        at
org.apache.flink.yarn.cli.FlinkYarnSessionCli.run(FlinkYarnSessionCli.java:534)
        at
org.apache.flink.yarn.cli.FlinkYarnSessionCli.lambda$main$5(FlinkYarnSessionCli.java:785)
        at java.security.AccessController.doPrivileged(Native Method)
        at javax.security.auth.Subject.doAs(Subject.java:422)
        at
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1754)
        at
org.apache.flink.runtime.security.HadoopSecurityContext.runSecured(HadoopSecurityContext.java:41)
        at
org.apache.flink.yarn.cli.FlinkYarnSessionCli.main(FlinkYarnSessionCli.java:785)
Caused by: java.lang.NullPointerException: null
        at
org.apache.hadoop.yarn.client.api.impl.YarnClientImpl.getChildQueues(YarnClientImpl.java:587)
        at
org.apache.hadoop.yarn.client.api.impl.YarnClientImpl.getAllQueues(YarnClientImpl.java:557)
        at
org.apache.flink.yarn.YarnClusterDescriptor.getClusterDescription(YarnClusterDescriptor.java:1247)
        ... 7 common frames omitted

------------------------------------------------------------
 The program finished with the following exception:

java.lang.RuntimeException: Couldn't get cluster description
        at
org.apache.flink.yarn.YarnClusterDescriptor.getClusterDescription(YarnClusterDescriptor.java:1254)
        at
org.apache.flink.yarn.cli.FlinkYarnSessionCli.run(FlinkYarnSessionCli.java:534)
        at
org.apache.flink.yarn.cli.FlinkYarnSessionCli.lambda$main$5(FlinkYarnSessionCli.java:785)
        at java.security.AccessController.doPrivileged(Native Method)
        at javax.security.auth.Subject.doAs(Subject.java:422)
        at
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1754)
        at
org.apache.flink.runtime.security.HadoopSecurityContext.runSecured(HadoopSecurityContext.java:41)
        at
org.apache.flink.yarn.cli.FlinkYarnSessionCli.main(FlinkYarnSessionCli.java:785)
Caused by: java.lang.NullPointerException
        at
org.apache.hadoop.yarn.client.api.impl.YarnClientImpl.getChildQueues(YarnClientImpl.java:587)
        at
org.apache.hadoop.yarn.client.api.impl.YarnClientImpl.getAllQueues(YarnClientImpl.java:557)
        at
org.apache.flink.yarn.YarnClusterDescriptor.getClusterDescription(YarnClusterDescriptor.java:1247)
        ... 7 more





caozhen <[hidden email]> 于2020年8月24日周一 上午10:00写道:

> 报错是 AM申请资源时vcore不够
>
> 1、可以确认当前队列是否有足够的vcore
> 2、确认当前队列允许的最大application数
>
> 我之前遇到这个问题是队列没有配置好资源导致
>
>
>
> --
> Sent from: http://apache-flink.147419.n8.nabble.com/
Reply | Threaded
Open this post in threaded view
|

Re: flink on yarn配置问题

Yang Wang
你确认upd_security这个queue是存在的吧?另外,你Yarn集群的scheduler是CapacityScheduler还是FairScheduler?
如果是FairScheduler的话,需要指定完整的queue名字,而不是只写叶子节点的名字。


Best,
Yang

赵一旦 <[hidden email]> 于2020年8月24日周一 上午10:55写道:

> 比如今天尝试了一波命令:./bin/yarn-session.sh -nm test_flink -q -qu upd_security -s 1
> -tm 3024MB -jm 3024MB
> 同时我设置了 export HADOOP_USER_NAME=xxx
> ,这个在启动的时候会看到日志:org.apache.flink.runtime.security.modules.HadoopModule  -
> Hadoop user set to upd_security (auth:SIMPLE)。
>
> 然后报错:
>
> 2020-08-24 10:52:31 ERROR org.apache.flink.yarn.cli.FlinkYarnSessionCli  -
> Error while running the Flink session.
> java.lang.RuntimeException: Couldn't get cluster description
>         at
>
> org.apache.flink.yarn.YarnClusterDescriptor.getClusterDescription(YarnClusterDescriptor.java:1254)
>         at
>
> org.apache.flink.yarn.cli.FlinkYarnSessionCli.run(FlinkYarnSessionCli.java:534)
>         at
>
> org.apache.flink.yarn.cli.FlinkYarnSessionCli.lambda$main$5(FlinkYarnSessionCli.java:785)
>         at java.security.AccessController.doPrivileged(Native Method)
>         at javax.security.auth.Subject.doAs(Subject.java:422)
>         at
>
> org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1754)
>         at
>
> org.apache.flink.runtime.security.HadoopSecurityContext.runSecured(HadoopSecurityContext.java:41)
>         at
>
> org.apache.flink.yarn.cli.FlinkYarnSessionCli.main(FlinkYarnSessionCli.java:785)
> Caused by: java.lang.NullPointerException: null
>         at
>
> org.apache.hadoop.yarn.client.api.impl.YarnClientImpl.getChildQueues(YarnClientImpl.java:587)
>         at
>
> org.apache.hadoop.yarn.client.api.impl.YarnClientImpl.getAllQueues(YarnClientImpl.java:557)
>         at
>
> org.apache.flink.yarn.YarnClusterDescriptor.getClusterDescription(YarnClusterDescriptor.java:1247)
>         ... 7 common frames omitted
>
> ------------------------------------------------------------
>  The program finished with the following exception:
>
> java.lang.RuntimeException: Couldn't get cluster description
>         at
>
> org.apache.flink.yarn.YarnClusterDescriptor.getClusterDescription(YarnClusterDescriptor.java:1254)
>         at
>
> org.apache.flink.yarn.cli.FlinkYarnSessionCli.run(FlinkYarnSessionCli.java:534)
>         at
>
> org.apache.flink.yarn.cli.FlinkYarnSessionCli.lambda$main$5(FlinkYarnSessionCli.java:785)
>         at java.security.AccessController.doPrivileged(Native Method)
>         at javax.security.auth.Subject.doAs(Subject.java:422)
>         at
>
> org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1754)
>         at
>
> org.apache.flink.runtime.security.HadoopSecurityContext.runSecured(HadoopSecurityContext.java:41)
>         at
>
> org.apache.flink.yarn.cli.FlinkYarnSessionCli.main(FlinkYarnSessionCli.java:785)
> Caused by: java.lang.NullPointerException
>         at
>
> org.apache.hadoop.yarn.client.api.impl.YarnClientImpl.getChildQueues(YarnClientImpl.java:587)
>         at
>
> org.apache.hadoop.yarn.client.api.impl.YarnClientImpl.getAllQueues(YarnClientImpl.java:557)
>         at
>
> org.apache.flink.yarn.YarnClusterDescriptor.getClusterDescription(YarnClusterDescriptor.java:1247)
>         ... 7 more
>
>
>
>
>
> caozhen <[hidden email]> 于2020年8月24日周一 上午10:00写道:
>
> > 报错是 AM申请资源时vcore不够
> >
> > 1、可以确认当前队列是否有足够的vcore
> > 2、确认当前队列允许的最大application数
> >
> > 我之前遇到这个问题是队列没有配置好资源导致
> >
> >
> >
> > --
> > Sent from: http://apache-flink.147419.n8.nabble.com/
>
Reply | Threaded
Open this post in threaded view
|

Re: flink on yarn配置问题

nobleyd
这个问题先暂停一段时间,这部分比较复杂,可能还涉及到自定义的scheduler,以及自定义的hadoop鉴权方式等。目前我也还不是很清楚,需要继续问问公司相关基础设施的同学。

Yang Wang <[hidden email]> 于2020年8月25日周二 上午11:21写道:

>
> 你确认upd_security这个queue是存在的吧,另外你Yarn集群的scheduler是capacityScheduler还是FairScheduler
> 如果是Fair的话,需要指定完整的queue名字,而不是叶子节点的
>
>
> Best,
> Yang
>
> 赵一旦 <[hidden email]> 于2020年8月24日周一 上午10:55写道:
>
> > 比如今天尝试了一波命令:./bin/yarn-session.sh -nm test_flink -q -qu upd_security -s 1
> > -tm 3024MB -jm 3024MB
> > 同时我设置了 export HADOOP_USER_NAME=xxx
> > ,这个在启动的时候会看到日志:org.apache.flink.runtime.security.modules.HadoopModule  -
> > Hadoop user set to upd_security (auth:SIMPLE)。
> >
> > 然后报错:
> >
> > 2020-08-24 10:52:31 ERROR org.apache.flink.yarn.cli.FlinkYarnSessionCli
> -
> > Error while running the Flink session.
> > java.lang.RuntimeException: Couldn't get cluster description
> >         at
> >
> >
> org.apache.flink.yarn.YarnClusterDescriptor.getClusterDescription(YarnClusterDescriptor.java:1254)
> >         at
> >
> >
> org.apache.flink.yarn.cli.FlinkYarnSessionCli.run(FlinkYarnSessionCli.java:534)
> >         at
> >
> >
> org.apache.flink.yarn.cli.FlinkYarnSessionCli.lambda$main$5(FlinkYarnSessionCli.java:785)
> >         at java.security.AccessController.doPrivileged(Native Method)
> >         at javax.security.auth.Subject.doAs(Subject.java:422)
> >         at
> >
> >
> org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1754)
> >         at
> >
> >
> org.apache.flink.runtime.security.HadoopSecurityContext.runSecured(HadoopSecurityContext.java:41)
> >         at
> >
> >
> org.apache.flink.yarn.cli.FlinkYarnSessionCli.main(FlinkYarnSessionCli.java:785)
> > Caused by: java.lang.NullPointerException: null
> >         at
> >
> >
> org.apache.hadoop.yarn.client.api.impl.YarnClientImpl.getChildQueues(YarnClientImpl.java:587)
> >         at
> >
> >
> org.apache.hadoop.yarn.client.api.impl.YarnClientImpl.getAllQueues(YarnClientImpl.java:557)
> >         at
> >
> >
> org.apache.flink.yarn.YarnClusterDescriptor.getClusterDescription(YarnClusterDescriptor.java:1247)
> >         ... 7 common frames omitted
> >
> > ------------------------------------------------------------
> >  The program finished with the following exception:
> >
> > java.lang.RuntimeException: Couldn't get cluster description
> >         at
> >
> >
> org.apache.flink.yarn.YarnClusterDescriptor.getClusterDescription(YarnClusterDescriptor.java:1254)
> >         at
> >
> >
> org.apache.flink.yarn.cli.FlinkYarnSessionCli.run(FlinkYarnSessionCli.java:534)
> >         at
> >
> >
> org.apache.flink.yarn.cli.FlinkYarnSessionCli.lambda$main$5(FlinkYarnSessionCli.java:785)
> >         at java.security.AccessController.doPrivileged(Native Method)
> >         at javax.security.auth.Subject.doAs(Subject.java:422)
> >         at
> >
> >
> org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1754)
> >         at
> >
> >
> org.apache.flink.runtime.security.HadoopSecurityContext.runSecured(HadoopSecurityContext.java:41)
> >         at
> >
> >
> org.apache.flink.yarn.cli.FlinkYarnSessionCli.main(FlinkYarnSessionCli.java:785)
> > Caused by: java.lang.NullPointerException
> >         at
> >
> >
> org.apache.hadoop.yarn.client.api.impl.YarnClientImpl.getChildQueues(YarnClientImpl.java:587)
> >         at
> >
> >
> org.apache.hadoop.yarn.client.api.impl.YarnClientImpl.getAllQueues(YarnClientImpl.java:557)
> >         at
> >
> >
> org.apache.flink.yarn.YarnClusterDescriptor.getClusterDescription(YarnClusterDescriptor.java:1247)
> >         ... 7 more
> >
> >
> >
> >
> >
> > caozhen <[hidden email]> 于2020年8月24日周一 上午10:00写道:
> >
> > > 报错是 AM申请资源时vcore不够
> > >
> > > 1、可以确认当前队列是否有足够的vcore
> > > 2、确认当前队列允许的最大application数
> > >
> > > 我之前遇到这个问题是队列没有配置好资源导致
> > >
> > >
> > >
> > > --
> > > Sent from: http://apache-flink.147419.n8.nabble.com/
> >
>