返回顶部

收藏

Java读取Html文本解析email地址的代码

更多
package com.alpha.test;import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;/**
 * 读取html页面文件解析邮箱地址
 * 
 * @author JavaAlpha 2012-12-19 13:45:11
 */
public class ReadHtmlToTxt {

    /** Number of characters taken on each side of the '@' when cutting out a candidate address. */
    private static final int EMAIL_WINDOW = 10;

    /** Suffix that marks the end of a recognised address; only ".com" addresses are detected. */
    private static final String DOMAIN_SUFFIX = ".com";

    /** Matches one character in the range U+4E00..U+9FA5; compiled once and reused. */
    private static final Pattern CHINESE_CHAR = Pattern.compile("[\\u4e00-\\u9fa5]");

    /**
     * Reads an HTML file and collects the e-mail candidates found in it.
     *
     * <p>The file is read in chunks of 4096 characters (platform default charset) and each
     * chunk is passed to {@link #checkEmail(String)}, so at most one address is extracted per
     * chunk and an address that straddles a chunk boundary is not recognised. Results are
     * concatenated without a separator.
     *
     * @param path path of the HTML file to read
     * @return the concatenated candidates, or an empty string when the file does not exist,
     *         is not a regular readable file, or nothing was found
     */
    public static String readHtml(String path) {
        StringBuilder emailCont = new StringBuilder();
        File htmlFile = new File(path);
        if (!htmlFile.exists() || !htmlFile.isFile() || !htmlFile.canRead()) {
            return emailCont.toString();
        }

        // try-with-resources closes the reader even when read() fails part-way through.
        try (Reader in = new FileReader(htmlFile)) {
            char[] buff = new char[4096];
            int nch;
            while ((nch = in.read(buff, 0, buff.length)) != -1) {
                emailCont.append(checkEmail(new String(buff, 0, nch)));
            }
        } catch (IOException e) {
            // Best effort: report the problem and return whatever was collected so far.
            // FileNotFoundException is an IOException and was handled identically before.
            e.printStackTrace();
        }
        return emailCont.toString();
    }

    /**
     * Extracts one e-mail candidate around the first '@' in the given text.
     *
     * <p>A window of up to {@value #EMAIL_WINDOW} characters on each side of the '@' is cut
     * out (clamped to the bounds of the input), markup characters and commas are stripped,
     * and the result is truncated right after the first ".com" that follows the '@'. The
     * candidate is also printed to standard output.
     *
     * @param str text to scan; {@code null} is treated as containing no address
     * @return the candidate ending in ".com", or an empty string when the text has no '@'
     *         or no ".com" follows it inside the window
     */
    public static String checkEmail(String str) {
        if (str == null) {
            return "";
        }
        int at = str.indexOf('@');
        if (at < 0) {
            return "";
        }

        // Clamp the window: an '@' near either end of the text used to make substring() throw.
        int start = Math.max(0, at - EMAIL_WINDOW);
        int end = Math.min(str.length(), at + EMAIL_WINDOW);
        String postCont = str.substring(start, end);

        // Tag remnants: '/' is only dropped when an angle bracket is present in the window.
        if (postCont.indexOf('>') > -1 || postCont.indexOf('<') > -1) {
            postCont = postCont.replace(">", "").replace("<", "").replace("/", "");
        }
        // Removing an absent character is a no-op, so no presence check is needed here.
        postCont = postCont.replace(",", "").replace("。", "");

        // Search from the '@' onwards so a ".com" in the text before the address is not
        // mistaken for its end.
        int suffix = postCont.indexOf(DOMAIN_SUFFIX, postCont.indexOf('@'));
        if (suffix < 0) {
            // No ".com" after the '@': not an address this heuristic recognises.
            return "";
        }
        postCont = postCont.substring(0, suffix + DOMAIN_SUFFIX.length());
        System.out.println(postCont);
        return postCont;
    }

    /**
     * Tells whether the text contains at least one Chinese character
     * (a character in the range U+4E00..U+9FA5).
     *
     * @param str text to test; must not be {@code null}
     * @return {@code true} if such a character occurs anywhere in {@code str}
     */
    public static boolean checkChinese(String str) {
        return CHINESE_CHAR.matcher(str).find();
    }

    /**
     * Writes the collected addresses to a file, replacing any previous content.
     *
     * <p>The file is created if it does not exist. I/O failures are reported on standard
     * error and otherwise ignored.
     *
     * @param cont text to write; {@code null} is written as an empty file
     * @param path path of the target file
     */
    public static void writerFile(String cont, String path) {
        File emailFile = new File(path);
        // FileWriter creates a missing file itself; try-with-resources flushes and closes
        // the writer even when write() fails.
        try (Writer out = new FileWriter(emailFile)) {
            out.write(cont == null ? "" : cont);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads the content behind a URL and prints it to standard output.
     *
     * <p>The response is read line by line (platform default charset) and every line is
     * followed by the marker {@code </br>}. On failure the stack trace is printed and
     * whatever was read up to that point is still printed.
     *
     * @param strUrl URL to fetch
     */
    public static void readUrlCont(String strUrl) {
        StringBuilder cont = new StringBuilder();

        try {
            URL url = new URL(strUrl);
            URLConnection conn = url.openConnection();
            // Close the reader (and the underlying stream) even when readLine() fails.
            try (BufferedReader reader =
                    new BufferedReader(new InputStreamReader(conn.getInputStream()))) {
                String lineCont;
                while ((lineCont = reader.readLine()) != null) {
                    cont.append(lineCont).append("</br>");
                }
            }
        } catch (IOException e) {
            // MalformedURLException is an IOException and was handled identically before.
            e.printStackTrace();
        }

        System.out.println(cont.toString());
    }

    /**
     * Demo entry point: fetches a sample page and prints it.
     *
     * <p>{@link #readHtml(String)}, {@link #writerFile(String, String)} and
     * {@link #checkChinese(String)} can be tried the same way with local paths or sample text.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        readUrlCont("http://www.163.com");
    }
}

标签:html解析,email,java

收藏

0人收藏

支持

0

反对

0

发表评论