Just to add more information: I have checked the status of this file, and not a single block is corrupted.
[hadoop@ip-172-31-24-27 ~]$ hadoop fsck /ankit -files -blocks
DEPRECATED: Use of this script to execute hdfs command is deprecated.
Instead use the hdfs command for it.
Connecting to namenode via http://ip-172-31-24-27.us-west-2.compute.internal:9101
FSCK started by hadoop (auth:SIMPLE) from /172.31.24.27 for path /ankit at Sun Jul 19 19:11:37 UTC 2015
/ankit <dir>
/ankit/SPARKSQLPOC-0.0.1-SNAPSHOT-jar-with-dependencies.jar 103599417 bytes, 1 block(s): OK
0. BP-511626939-172.31.24.27-1436185102368:blk_1073741964_126634 len=103599417 repl=1

Status: HEALTHY
 Total size:                    103599417 B
 Total dirs:                    1
 Total files:                   1
 Total symlinks:                0
 Total blocks (validated):      1 (avg. block size 103599417 B)
 Minimally replicated blocks:   1 (100.0 %)
 Over-replicated blocks:        0 (0.0 %)
 Under-replicated blocks:       0 (0.0 %)
 Mis-replicated blocks:         0 (0.0 %)
 Default replication factor:    1
 Average block replication:     1.0
 Corrupt blocks:                0
 Missing replicas:              0 (0.0 %)
 Number of data-nodes:          4
 Number of racks:               1
FSCK ended at Sun Jul 19 19:11:37 UTC 2015 in 1 milliseconds
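Note that fsck runs from inside the cluster, so it only confirms that the NameNode and DataNodes consider the block healthy; it says nothing about whether the remote JVM can actually reach a DataNode to read it. A minimal read test from the remote machine might look like the sketch below (just a sketch: the class name RemoteHdfsReadCheck is made up for illustration, and the NameNode address is assumed from the --jar URL in the code quoted below). If this also fails with ConnectTimeoutException or BlockMissingException, the problem is DataNode reachability from outside the cluster rather than corruption.

import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Sketch: read the jar through the HDFS client API from the remote JVM.
// A failure here exercises the same DFSClient read path that the
// exception quoted below comes from.
public class RemoteHdfsReadCheck {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://52.24.76.10:9000"); // assumed NameNode address
        FileSystem fs = FileSystem.get(conf);
        Path jar = new Path("/ankit/SPARKSQLPOC-0.0.1-SNAPSHOT-jar-with-dependencies.jar");

        long total = 0;
        try (InputStream in = fs.open(jar)) {
            byte[] buf = new byte[8192];
            int n;
            while ((n = in.read(buf)) != -1) {
                total += n;
            }
        }
        System.out.println("Read " + total + " bytes from " + jar);
    }
}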
On Mon, Jul 20, 2015 at 12:33 AM, ankit tyagi <ankittyagi.mn...@gmail.com> wrote:

> Hi,
>
> I am using the below code to trigger a Spark job from a remote JVM.
>
> import org.apache.hadoop.conf.Configuration;
> import org.apache.spark.SparkConf;
> import org.apache.spark.deploy.yarn.Client;
> import org.apache.spark.deploy.yarn.ClientArguments;
>
> /**
>  * @version 1.0, 15-Jul-2015
>  * @author ankit
>  */
> public class QueryEngineImpl implements IQueryEngine {
>
>     SparkSqlEngine sqlEngine;
>
>     public QueryEngineImpl(SparkSqlEngine sparkSqlEngine) {
>         this.sqlEngine = sparkSqlEngine;
>     }
>
>     @Override
>     public void executeQuery(String query, String resultLocation,
>             String... parquetFileLocation) {
>         String[] args = new String[] {
>                 // the name of your application
>                 "--name",
>                 "RemoteJVM",
>
>                 // memory for driver (optional)
>                 "--driver-memory",
>                 "1000M",
>
>                 // path to your application's JAR file (required in yarn-cluster mode)
>                 "--jar",
>                 "hdfs://52.24.76.10:9000/ankit/SPARKSQLPOC-0.0.1-SNAPSHOT-jar-with-dependencies.jar",
>
>                 // name of your application's main class (required)
>                 "--class",
>                 "SparkSqlEngine",
>
>                 // argument 1 to your Spark program (SparkFriendRecommendation)
>                 "--arg",
>                 query,
>
>                 // argument 2 to your Spark program (SparkFriendRecommendation)
>                 "--arg",
>                 resultLocation,
>
>                 // argument 3 to your Spark program (SparkFriendRecommendation)
>                 "--arg",
>                 parquetFileLocation[0],
>
>                 "--arg",
>                 "yarn-cluster" };
>
>         Configuration conf = new Configuration();
>         conf.set("yarn.resourcemanager.address", "52.24.76.10:9022");
>         conf.set("HADOOP_HOME", "/home/hadoop");
>
>         System.setProperty("SPARK_YARN_MODE", "true");
>         SparkConf sparkConf = new SparkConf();
>         System.out.println("SPARK CONF" + sparkConf.toDebugString());
>
>         // create ClientArguments, which will be passed to Client
>         ClientArguments cArgs = new ClientArguments(args, sparkConf);
>
>         // create an instance of the YARN Client
>         Client client = new Client(cArgs, conf, sparkConf);
>
>         client.run();
>     }
>
>     public static void main(String[] args) {
>         QueryEngineImpl impl = new QueryEngineImpl(null);
>         impl.executeQuery("select count(*) from parquetTable",
>                 "/tmp/ankit.txt",
>                 "s3n://AKIAJPLOFN3DM27DIIUQ:zKsFTopwgmu4zNdAfZ5Xe+Qe0XtbegHLTgy629VB@hadoop-poc-ashish/parquet");
>     }
> }
>
> But I am getting the below exception.
>
> 23:08:09.268 [main] WARN org.apache.hadoop.hdfs.DFSClient - Failed to connect to /172.31.24.27:9200 for block, add to deadNodes and continue. org.apache.hadoop.net.ConnectTimeoutException: 60000 millis timeout while waiting for channel to be ready for connect. ch : java.nio.channels.SocketChannel[connection-pending remote=/172.31.24.27:9200]
> org.apache.hadoop.net.ConnectTimeoutException: 60000 millis timeout while waiting for channel to be ready for connect. ch : java.nio.channels.SocketChannel[connection-pending remote=/172.31.24.27:9200]
>         at org.apache.hadoop.net.NetUtils.connect(NetUtils.java:532) ~[hadoop-common-2.2.0.jar:na]
>         at org.apache.hadoop.hdfs.DFSInputStream.newTcpPeer(DFSInputStream.java:955)
>
> 23:08:09.269 [main] WARN org.apache.hadoop.hdfs.DFSClient - DFS Read
> org.apache.hadoop.hdfs.BlockMissingException: Could not obtain block: BP-511626939-172.31.24.27-1436185102368:blk_1073741964_126634 file=/ankit/SPARKSQLPOC-0.0.1-SNAPSHOT-jar-with-dependencies.jar
>         at org.apache.hadoop.hdfs.DFSInputStream.chooseDataNode(DFSInputStream.java:838) [hadoop-hdfs-2.2.0.jar:na]
>         at org.apache.hadoop.hdfs.DFSInputStream.blockSeekTo(DFSInputStream.java:526) [hadoop-hdfs-2.2.0.jar:na]
>         at org.apache.hadoop.hdfs.DFSInputStream.readWithStrategy(DFSInputStream.java:749) [hadoop-hdfs-2.2.0.jar:na]
>         at org.apache.hadoop.hdfs.DFSInputStream.read(DFSInputStream.java:793) [hadoop-hdfs-2.2.0.jar:na]
>         at java.io.DataInputStream.read(DataInputStream.java:100) [na:1.7.0_80]
>
> Please suggest: is there any other way to trigger a job from a remote JVM?
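One more observation from the trace above: the DFS client times out connecting to the DataNode's private EC2 address (172.31.24.27:9200), while the jar URL uses the public address, so the remote JVM may simply not be able to reach the DataNodes on their internal IPs. As a hedged sketch only (the NameNode address is assumed from the --jar URL, and whether this helps on this cluster is an assumption), the HDFS client can be told to contact DataNodes by hostname instead of by the IP the NameNode reports, on the same Configuration object built in executeQuery:

import org.apache.hadoop.conf.Configuration;

// Illustrative client-side settings; the DataNode hostnames must still
// resolve to addresses that are reachable from the remote machine.
Configuration conf = new Configuration();
conf.set("fs.defaultFS", "hdfs://52.24.76.10:9000");           // assumed NameNode address
conf.setBoolean("dfs.client.use.datanode.hostname", true);     // dial DataNodes by hostname
conf.set("yarn.resourcemanager.address", "52.24.76.10:9022");  // as in the quoted code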