gif animé ordinateur

dimanche 11 août 2013

Searching for a whole word in text with Java

I recently ran into some issues when searching for a whole word in a given text. After some googling and testing many regular-expression patterns, I finally settled on a Java algorithm that fulfils this requirement.
In addition to finding whole words, my algorithm has to take into account the fact that my text is ISO-Latin encoded (French words), so it contains accented characters.
Take a look at this code snippet; I hope you enjoy it.

import java.text.Normalizer;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import junit.framework.TestCase;

public class TestRegex extends TestCase{
 public void testRegex(){
  String INPUT = "l'postéEç! toto tata a problème à probleme";
  List<String> listCnil = Arrays.asList(new String[]{"l'postéeç!", "a problème"});
  String unAccentInput = unAccent(INPUT);
    
  Set<String> inputWords = new HashSet<String>();
  
  for (String word : listCnil) {
   word = unAccent(word);
   Pattern p = Pattern.compile("\\b\\Q" + word + "\\E\\b", Pattern.CASE_INSENSITIVE);
   Matcher m = p.matcher(unAccentInput);
   
   while (m.find()) {
    String tt = INPUT.substring(m.start(), m.end());
    inputWords.add(tt);
   }
 
  }
  
  for (String str : inputWords) {
   Pattern p = Pattern.compile(\\b\\Q"+str+"\\E\\b", Pattern.CASE_INSENSITIVE);
   Matcher m = p.matcher(INPUT);
   StringBuffer sb = new StringBuffer();
   while (m.find()) {
    String REMP = "<p>"+m.group()+"</p>";
    m.appendReplacement(sb, REMP);
   }
   m.appendTail(sb);
   INPUT=sb.toString();
  }
  System.out.println(INPUT);
 }
 
 
   public static String unAccent(String s) {
       String temp = Normalizer.normalize(s, Normalizer.Form.NFD);
       Pattern pattern = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");
       return pattern.matcher(temp).replaceAll("");
   }

}