HDFS和本地文件系统文件互导_教程

先初步了解基本用法，后面再给出具体案例。

一、从本地文件系统到HDFS

使用hdfs自带的命令

命令：hdfs dfs -copyFromLocal inputPath outputPath

inputPath：本地文件目录的路径

outputPath：hdfs文件目录路径，即存储路径

二、从HDFS到本地文件系统

命令：hdfs dfs -copyToLocal inputPath outputPath

inputPath：hdfs文件目录

outputPath：本地文件目录，即本地存储路径

因为HBase和Hive的数据都存储在HDFS中，所以可以通过该命令把HBase和Hive存储在HDFS中的文件复制出来。但是经过实践，通过这种方式复制出来的HBase文件是乱码；Hive里的文件有时候也会乱码，这取决于Hive数据的插入方式。

三、文件在HDFS内的移动

1、从Hbase表导出数据到HDFS

命令：hbase org.apache.hadoop.hbase.mapreduce.Export tableName outputPath

例子：hbase org.apache.hadoop.hbase.mapreduce.Export test /user/data

test为需要从HBase中导出的表，/user/data为hdfs上的路径，即存储路径；如果最后一个参数带有前缀 file://，则表示导出到本地文件系统。

2、从HDFS导入到Hbase表中，需要事先建立好表结构

命令：hbase org.apache.hadoop.hbase.mapreduce.Import tableName inputPath

例子：hbase org.apache.hadoop.hbase.mapreduce.Import test1 /temp/part-m-00000

案例：

在两个不同环境之间导入数据

过程描述：

导出正式环境数据到hdfs中，然后从hdfs中导出到本地，本地传到测试环境主机，然后从本地导入到hdfs中，再从hdfs中导入到hbase中。

处理过程：

注意事项：1、遇到权限问题时使用hdfs用户执行命令：sudo -u hdfs；

2、存放上传路径最好不要在root下

3、上传完成后，查看表是否可以正常使用、数据是否已经插入。

1、sudo -u hdfs hbase org.apache.hadoop.hbase.mapreduce.Export ** /hbase/**_bak （导出到hdfs中的**_bak）

2、hdfs dfs -copyToLocal /hbase/sw_bak /test (导出hdfs中文件到本地test，注：提前建好目录)

3、scp -r test_bak root@192.168.90.**:/root/test （传送目录到测试环境主机目录下，注：传到测试环境后，不要把文件放在root的目录下，换到其他用户的家目录下）

4、sudo -u hdfs hdfs dfs -copyFromLocal /chenzeng/test_bak /data （把test_bak上传到hdfs中，注意上传时文件路径要对，放在data路径下比较好）

5、sudo -u hdfs hbase org.apache.hadoop.hbase.mapreduce.Import test /data/test_bak/part-m-00000 （注意上传的文件）

6、在hbase shell 中查看test ：count 'test' 确认是否上传成功

优化：

truncate ‘’

正式环境导入至hdfs中时，

可以直接在另一个环境中执行 sudo -u hdfs hbase org.apache.hadoop.hbase.mapreduce.Import test hdfs://server243:8020/hbase**** ，即在输入路径中直接写上源集群的主机、端口和对应路径进行导入，省去先拷贝到本地再上传的步骤。

Java API读写HDFS

public class FSOptr {

/**

* @param args

public static void main(String[] args) throws Exception {

// TODO Auto-generated method stub

Configuration conf = new Configuration()

makeDir(conf)

rename(conf)

delete(conf)

}

// 创建文件目录

private static void makeDir(Configuration conf) throws Exception {

FileSystem fs = FileSystem.get(conf)

Path dir = new Path("/user/hadoop/data/20140318")

boolean result = fs.mkdirs(dir)// 创建文件夹

System.out.println("make dir :" + result)

// 创建文件，并写入内容

Path dst = new Path("/user/hadoop/data/20140318/tmp")

byte[] buff = "hello,hadoop!".getBytes()

FSDataOutputStream outputStream = fs.create(dst)

outputStream.write(buff, 0, buff.length)

outputStream.close()

FileStatus files[] = fs.listStatus(dst)

for (FileStatus file : files) {

System.out.println(file.getPath())

}

fs.close()

}

// 重命名文件

private static void rename(Configuration conf) throws Exception {

FileSystem fs = FileSystem.get(conf)

Path oldName = new Path("/user/hadoop/data/20140318/1.txt")

Path newName = new Path("/user/hadoop/data/20140318/2.txt")

fs.rename(oldName, newName)

FileStatus files[] = fs.listStatus(new Path(

"/user/hadoop/data/20140318"))

for (FileStatus file : files) {

System.out.println(file.getPath())

}

fs.close()

}

// 删除文件

@SuppressWarnings("deprecation")

private static void delete(Configuration conf) throws Exception {

FileSystem fs = FileSystem.get(conf)

Path path = new Path("/user/hadoop/data/20140318")

if (fs.isDirectory(path)) {

FileStatus files[] = fs.listStatus(path)

for (FileStatus file : files) {

fs.delete(file.getPath())

}

} else {

fs.delete(path)

}

// 或者

fs.delete(path, true)

fs.close()

}

/**

* 下载,将hdfs文件下载到本地磁盘

* @param localSrc1

*本地的文件地址，即文件的路径

* @param hdfsSrc1

*存放在hdfs的文件地址

public boolean sendFromHdfs(String hdfsSrc1, String localSrc1) {

Configuration conf = new Configuration()

FileSystem fs = null

try {

fs = FileSystem.get(URI.create(hdfsSrc1), conf)

Path hdfs_path = new Path(hdfsSrc1)

Path local_path = new Path(localSrc1)

fs.copyToLocalFile(hdfs_path, local_path)

return true

} catch (IOException e) {

e.printStackTrace()

}

return false

}

/**

* 上传，将本地文件copy到hdfs系统中

* @param localSrc

*本地的文件地址，即文件的路径

* @param hdfsSrc

*存放在hdfs的文件地址

public boolean sendToHdfs1(String localSrc, String hdfsSrc) {

InputStream in

try {

in = new BufferedInputStream(new FileInputStream(localSrc))

Configuration conf = new Configuration()// 得到配置对象

FileSystem fs// 文件系统

try {

fs = FileSystem.get(URI.create(hdfsSrc), conf)

// 输出流，创建一个输出流

OutputStream out = fs.create(new Path(hdfsSrc),

new Progressable() {

// 重写progress方法

public void progress() {

// System.out.println("上传完一个设定缓存区大小容量的文件！")

}

})

// 连接两个流，形成通道，使输入流向输出流传输数据,

IOUtils.copyBytes(in, out, 10240, true)// in为输入流对象，out为输出流对象，4096为缓冲区大小，true为上传后关闭流

return true

} catch (IOException e) {

e.printStackTrace()

}

} catch (FileNotFoundException e) {

e.printStackTrace()

}

return false

}

/**

* 移动

* @param old_st原来存放的路径

* @param new_st移动到的路径

public boolean moveFileName(String old_st, String new_st) {

try {

// 下载到服务器本地

boolean down_flag = sendFromHdfs(old_st, "/home/hadoop/文档/temp")

Configuration conf = new Configuration()

FileSystem fs = null

// 删除源文件

try {

fs = FileSystem.get(URI.create(old_st), conf)

Path hdfs_path = new Path(old_st)

fs.delete(hdfs_path)

} catch (IOException e) {

e.printStackTrace()

}

// 从服务器本地传到新路径

new_st = new_st + old_st.substring(old_st.lastIndexOf("/"))

boolean uplod_flag = sendToHdfs1("/home/hadoop/文档/temp", new_st)

if (down_flag &&uplod_flag) {

return true

}

} catch (Exception e) {

e.printStackTrace()

}

return false

}

// copy本地文件到hdfs

private static void CopyFromLocalFile(Configuration conf) throws Exception {

FileSystem fs = FileSystem.get(conf)

Path src = new Path("/home/hadoop/word.txt")

Path dst = new Path("/user/hadoop/data/")

fs.copyFromLocalFile(src, dst)

fs.close()

}

// 获取给定目录下的所有子目录以及子文件

private static void getAllChildFile(Configuration conf) throws Exception {

FileSystem fs = FileSystem.get(conf)

Path path = new Path("/user/hadoop")

getFile(path, fs)

}

private static void getFile(Path path, FileSystem fs)throws Exception {

FileStatus[] fileStatus = fs.listStatus(path)

for (int i = 0i <fileStatus.lengthi++) {

if (fileStatus[i].isDir()) {

Path p = new Path(fileStatus[i].getPath().toString())

getFile(p, fs)

} else {

System.out.println(fileStatus[i].getPath().toString())

}

//判断文件是否存在

private static boolean isExist(Configuration conf,String path)throws Exception{

FileSystem fileSystem = FileSystem.get(conf)

return fileSystem.exists(new Path(path))

}

//获取hdfs集群所有主机结点数据

private static void getAllClusterNodeInfo(Configuration conf)throws Exception{

FileSystem fs = FileSystem.get(conf)

DistributedFileSystem hdfs = (DistributedFileSystem)fs

DatanodeInfo[] dataNodeStats = hdfs.getDataNodeStats()

String[] names = new String[dataNodeStats.length]

System.out.println("list of all the nodes in HDFS cluster:")//print info

for(int i=0i <dataNodeStats.lengthi++){

names[i] = dataNodeStats[i].getHostName()

System.out.println(names[i])//print info

}

//get the locations of a file in HDFS

private static void getFileLocation(Configuration conf)throws Exception{

FileSystem fs = FileSystem.get(conf)

Path f = new Path("/user/cluster/dfs.txt")

FileStatus filestatus = fs.getFileStatus(f)

BlockLocation[] blkLocations = fs.getFileBlockLocations(filestatus,0,filestatus.getLen())

int blkCount = blkLocations.length

for(int i=0i <blkCounti++){

String[] hosts = blkLocations[i].getHosts()

//Do sth with the block hosts

System.out.println(hosts)

}

//get HDFS file last modification time

private static void getModificationTime(Configuration conf)throws Exception{

FileSystem fs = FileSystem.get(conf)

Path f = new Path("/user/cluster/dfs.txt")

FileStatus filestatus = fs.getFileStatus(f)

long modificationTime = filestatus.getModificationTime()// measured in milliseconds since the epoch

Date d = new Date(modificationTime)

System.out.println(d)

}

在Spark中，sc.textFile("路径") 默认是从hdfs读取文件；也可以在路径前面显式加上 hdfs:// 表示从hdfs文件系统上读。

本地文件读取 sc.textFile("路径").在路径前面加上file:// 表示从本地文件系统读，如file:///home/user/spark/README.md

欢迎分享，转载请注明来源：内存溢出

原文地址: http://outofmemory.cn/tougao/8069653.html

HDFS和本地文件系统文件互导

发表评论

评论列表（0条）