In addition to finding whole words, my algorithm should take into account the fact that my text is ISO-Latin encoded (French words), and thus contains accented characters.
Take a look at this code snippet; I hope you enjoy it.
import java.text.Normalizer; import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import junit.framework.TestCase; public class TestRegex extends TestCase{ public void testRegex(){ String INPUT = "l'postéEç! toto tata a problème à probleme"; List<String> listCnil = Arrays.asList(new String[]{"l'postéeç!", "a problème"}); String unAccentInput = unAccent(INPUT); Set<String> inputWords = new HashSet<String>(); for (String word : listCnil) { word = unAccent(word); Pattern p = Pattern.compile("\\b\\Q" + word + "\\E\\b", Pattern.CASE_INSENSITIVE); Matcher m = p.matcher(unAccentInput); while (m.find()) { String tt = INPUT.substring(m.start(), m.end()); inputWords.add(tt); } } for (String str : inputWords) { Pattern p = Pattern.compile(\\b\\Q"+str+"\\E\\b", Pattern.CASE_INSENSITIVE); Matcher m = p.matcher(INPUT); StringBuffer sb = new StringBuffer(); while (m.find()) { String REMP = "<p>"+m.group()+"</p>"; m.appendReplacement(sb, REMP); } m.appendTail(sb); INPUT=sb.toString(); } System.out.println(INPUT); } public static String unAccent(String s) { String temp = Normalizer.normalize(s, Normalizer.Form.NFD); Pattern pattern = Pattern.compile("\\p{InCombiningDiacriticalMarks}+"); return pattern.matcher(temp).replaceAll(""); } }
Aucun commentaire:
Enregistrer un commentaire