返回顶部

收藏

java判断utf8字符包含几个字节(utf8mb4判断)

更多

MySQL 的 utf8 字符集(例如排序规则 utf8_general_ci)每个字符最多只占 3 个字节, 无法存储 4 字节的 UTF-8 字符(如 emoji); 向这样的列插入 4 字节字符时会出问题(报错或数据被截断)。为了避免此类问题, 可以在写入之前先对字符串做判断或替换, 代码片段如下:

import java.nio.charset.Charset;

/**
 * Helpers for applications that accept arbitrary UTF-8 text but store it in MySQL
 * columns using the 3-byte {@code utf8} character set (e.g. collation
 * {@code utf8_general_ci}), which cannot hold 4-byte UTF-8 sequences.
 *
 * <p>A character is encoded as 4 bytes in UTF-8 exactly when it is a supplementary
 * code point (U+10000 and above), and a Java {@code String} stores such a code point
 * as a high surrogate immediately followed by a low surrogate. Both methods therefore
 * scan the string's UTF-16 chars directly instead of encoding the string to bytes.
 * Unpaired surrogates are not 4-byte characters and are treated as ordinary chars.
 *
 * Created by zhaoyukai on 2017/9/28.
 */
public class UTF8MB4 {

    /**
     * Tells whether the string contains at least one character that needs 4 bytes in UTF-8.
     *
     * @param input the string to inspect; may be {@code null}
     * @return {@code true} if a 4-byte character is present; {@code false} otherwise,
     *         including when {@code input} is {@code null} or empty
     */
    public static boolean containsMb4Char(String input) {
        if (input == null) {
            return false;
        }
        // A surrogate pair needs two chars, so the last index can never start one.
        for (int i = 0, last = input.length() - 1; i < last; i++) {
            if (Character.isSurrogatePair(input.charAt(i), input.charAt(i + 1))) {
                return true;
            }
        }
        return false;
    }

    /**
     * Replaces every character that needs 4 bytes in UTF-8 with the given replacement.
     *
     * @param input       the string to clean; must not be {@code null}
     * @param replacement the text inserted in place of each 4-byte character; {@code null}
     *                    is treated as the empty string (i.e. the character is removed)
     * @return the input with each 4-byte character substituted; all other chars are kept as-is
     * @throws IllegalArgumentException if {@code input} is {@code null}
     */
    public static String replaceMb4Char(String input, String replacement) {
        if (input == null) {
            throw new IllegalArgumentException("input can not be null when replaceMb4Char");
        }
        // StringBuilder.append((String) null) would insert the literal text "null".
        String substitute = replacement == null ? "" : replacement;

        int length = input.length();
        StringBuilder sb = new StringBuilder(length);
        int i = 0;
        while (i < length) {
            char current = input.charAt(i);
            if (i + 1 < length && Character.isSurrogatePair(current, input.charAt(i + 1))) {
                // One 4-byte character occupies two chars in the string: skip both.
                sb.append(substitute);
                i += 2;
            } else {
                sb.append(current);
                i++;
            }
        }
        return sb.toString();
    }
}

如下是单元测试代码:

import org.apache.commons.io.IOUtils;
import org.junit.Assert;
import org.junit.Test;
import org.springframework.util.StringUtils;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.List;

/**
 * Unit tests for {@link UTF8MB4}.
 *
 * Created by zhaoyukai on 2017/9/27.
 */
public class Utf8Mb4Test {
    private final static Charset UTF8 = Charset.forName("UTF-8");

    /** Emoji (4-byte characters) in the middle of a string are removed when replaced by "". */
    @Test
    public void testReplacement() {
        String input = "A啊中\uD83D\uDE00\uD83D\uDC7D\uD83D\uDC94哈哈哈";
        String output = UTF8MB4.replaceMb4Char(input, "");
        String expect = "A啊中哈哈哈";
        Assert.assertEquals(expect, output);
    }

    /** Strings made only of 1- to 3-byte characters must not be reported as containing mb4. */
    @Test
    public void testContainsMb4() {
        testFalse("呵呵呵");
        testFalse("AAAA");
        testFalse(",,,");
        testFalse("中国。。,&………………");
        testFalse("我们mmm他们..你们abcdddd牛m");
    }

    /** Rough timing of replaceMb4Char; prints the elapsed milliseconds and asserts nothing. */
    @Test
    public void testReplacePerf() {
        long startMm = System.currentTimeMillis();
        int times = 10000000;
        while (times > 0) {
            UTF8MB4.replaceMb4Char("朝阳区和平街胜古东里1号楼4单元60\uD83D\uDC7D\uD83D\uDC941", "");
            times--;
        }
        long end = System.currentTimeMillis();
        long used = end - startMm;
        System.out.println(used);
    }

    /** Rough timing of containsMb4Char; prints the elapsed milliseconds and asserts nothing. */
    @Test
    public void testContainsPerf() {
        long startMm = System.currentTimeMillis();
        int times = 10000000;
        while (times > 0) {
            UTF8MB4.containsMb4Char("朝阳区和平街胜古东里1号楼4单元60\uD83D\uDC7D\uD83D\uDC941");
            times--;
        }
        long end = System.currentTimeMillis();
        long used = end - startMm;
        System.out.println(used);
    }

    /** The whole content of contains.txt, and a literal with emoji, must be reported as mb4. */
    @Test
    public void testContainsMb4True() throws IOException {
        InputStream stream = openResource("contains.txt");
        try {
            // IOUtils.toString does not close the stream, so it is closed in finally.
            String input = IOUtils.toString(stream, UTF8);
            testTrue(input);
        } finally {
            IOUtils.closeQuietly(stream);
        }
        testTrue("\"A啊中\uD83D\uDE00\uD83D\uDC7D\uD83D\uDC94\"");
    }

    /**
     * Checks containsMb4Char line by line against contains.txt.
     * The first char of each non-empty line is the expectation ('1' means "contains mb4",
     * anything else means "does not"); the rest of the line is the text under test.
     */
    @Test
    public void testContainsCharsInFile() throws IOException {
        InputStream stream = openResource("contains.txt");
        try {
            List<String> lines = IOUtils.readLines(stream, UTF8);
            for (String line : lines) {
                if (StringUtils.isEmpty(line)) {
                    continue;
                }

                char first = line.charAt(0);
                String last = line.substring(1);
                boolean expectContains = first == '1';
                boolean actualContains = UTF8MB4.containsMb4Char(last);
                Assert.assertEquals(String.format("%s is %s but %s", last, expectContains, actualContains),
                        expectContains, actualContains);
            }
        } finally {
            IOUtils.closeQuietly(stream);
        }
    }

    /**
     * Checks replaceMb4Char line by line against replace.txt.
     * Each non-empty line has the form {@code expected=input}, split at the first '=':
     * replacing mb4 characters of {@code input} with "" must yield {@code expected}.
     */
    @Test
    public void testReplaceCharsInFile() throws IOException {
        InputStream stream = openResource("replace.txt");
        try {
            List<String> lines = IOUtils.readLines(stream, UTF8);

            for (String line : lines) {
                if (StringUtils.isEmpty(line)) {
                    continue;
                }

                int idxEq = line.indexOf('=');

                if (idxEq == -1) {
                    throw new RuntimeException("测试文本错误, 未按=分隔");
                }
                String afterReplace = line.substring(0, idxEq);
                String beforeReplace = line.substring(idxEq + 1);
                String real = UTF8MB4.replaceMb4Char(beforeReplace, "");
                String expect = afterReplace;

                Assert.assertEquals(expect, real);
            }
        } finally {
            IOUtils.closeQuietly(stream);
        }
    }

    /** Asserts that the input is reported as containing a 4-byte character. */
    void testTrue(String input) {
        boolean contains = UTF8MB4.containsMb4Char(input);
        Assert.assertTrue(contains);
    }

    /** Asserts that the input is reported as not containing a 4-byte character. */
    void testFalse(String input) {
        boolean contains = UTF8MB4.containsMb4Char(input);
        Assert.assertFalse(contains);
    }

    /**
     * Exploratory dump, no assertions: prints each char following the start char together
     * with the first byte of its UTF-8 encoding in hex, 50 entries per line.
     * The (char) cast keeps only the low 16 bits, so the values wrap around past U+FFFF.
     */
    @Test
    public void testMb4() {
        char c = 'ﭾ';
        int count = 1;
        System.out.println(String.format("%02X", (int) c));
        while (count < 110000) {
            char nc = (char) ((int) c + count);
            System.out.print(nc);
            byte[] bytes = String.valueOf(nc).getBytes(UTF8);
            System.out.print(String.format("%02X", bytes[0]));
            count++;
            if (count % 50 == 0) {
                System.out.println();
            }
        }
    }

    /** Opens a classpath resource, failing the test with a clear message if it is missing. */
    private InputStream openResource(String name) {
        InputStream stream = getClass().getClassLoader().getResourceAsStream(name);
        Assert.assertNotNull("test resource not found on classpath: " + name, stream);
        return stream;
    }
}

标签:java,utf-8

收藏

0人收藏

支持

0

反对

0

相关聚客文章
  1. 博主 发表 2015-05-29 15:39:02 深入Java泛型
  2. 尖兵 发表 2018-09-04 08:08:08 原 荐 再看ThreadLocal
  3. 博主 发表 2015-05-19 14:18:53 使用Java调用Python服务器RPC
  4. 魔术师Carvendy 发表 2018-09-06 05:00:23 《Python3.6官方文档》14 章
  5. jeasonzhao@gmail.com 发表 2014-12-03 01:52:19 PhoneGAP,NodeJS和ADT那一些个破事:1环境搭建
  6. zhuangli 发表 2018-09-08 16:15:32 我的2018秋招总结
  7. 博主 发表 2014-06-19 07:18:58 Spring MVC注解小例子
  8. 博主 发表 2018-09-11 08:42:59 SpringBoot配置属性参数中文说明文档
  9. ChenShan 发表 2015-03-16 16:00:00 Parameterized Test
  10. TiuVe2 发表 2018-09-13 02:12:05 Map 大家族的那点事儿 ( 6 ) :LinkedHashMap
  11. 还如一梦中 发表 2015-06-03 07:41:11 独家一键启动环境Java&Tomcat&PHP&MySQL
  12. songhua.gao 发表 2018-09-16 13:37:57 轻量级应用 Spring 特性

发表评论