Simple Unicode strings such as ???
or ????????
match in a C# regex using the following pattern, but they don’t match in Java.
Can anyone explain this? How do I correct the pattern so that it works in Java?
"\b[\w\p{M}\u200B\u200C\u00AC\u001F\u200D\u200E\u200F]+\b"
C# code (it matches the strings):
// Regex instance shared by all calls; assigned once in the static constructor below.
private static readonly Regex s_regexEngine;

// A word is a run of one or more characters drawn from:
//   \w       word characters
//   \p{M}    combining marks
//   U+200B, U+200C, U+00AC, U+001F, U+200D, U+200E, U+200F (listed explicitly)
// The literal is a verbatim string (@"..."), so each backslash reaches the regex
// engine unchanged and must be written exactly once.
private static readonly string s_wordPattern = @"[\w\p{M}\u200B\u200C\u00AC\u001F\u200D\u200E\u200F]+";

// Builds the shared regex from s_wordPattern with RegexOptions.Multiline.
static PersianWordTokenizer()
{
    s_regexEngine = new Regex(s_wordPattern, RegexOptions.Multiline);
}
// Excerpt of the C# tokenizer entry point; only the start of the method is shown here.
public static List<string> Tokenize(string text, bool removeSeparators, bool standardized)
{
List<string> tokens = new List<string>();
int strIndex = 0;
// Walk every match that the shared regex finds in the input text.
foreach (Match match in s_regexEngine.Matches(text))
{
// Execution enters this block: the pattern matches the input.
}
Java code (it doesn't match the strings):
// Word pattern ported from the C# version, wrapped in word-boundary anchors.
// Every regex backslash is doubled because this is an ordinary Java string literal:
// a single-backslash "\b" would be a backspace character, and "\w" / "\p" are not
// legal Java escape sequences at all.
private static final String s_wordPattern = "\\b[\\w\\p{M}\\u200B\\u200C\\u00AC\\u001F\\u200D\\u200E\\u200F]+\\b";
static
{
    // NOTE(review): Pattern.quote(...) wraps its argument so that it is matched as a
    // literal string, i.e. the regex metacharacters above lose their meaning. This is
    // why find() never succeeds on ordinary text. To get regex behaviour, compile
    // s_wordPattern directly; adding Pattern.UNICODE_CHARACTER_CLASS makes \w cover
    // non-ASCII letters (by default Java's \w is [a-zA-Z_0-9] only).
    s_regexpattern = Pattern.compile(Pattern.quote(s_wordPattern));
}
// Excerpt of the Java port of the tokenizer; only the start of the method is shown here.
public static java.util.ArrayList<String> Tokenize(String text, boolean removeSeparators, boolean standardized)
{
java.util.ArrayList<String> tokens = new java.util.ArrayList<String>();
int strIndex = 0;
// Bind the precompiled pattern to this input, then step through successive matches.
s_regexEngine=s_regexpattern.matcher(text);
while(s_regexEngine.find())
{
// Execution never enters this block: find() reports no match.
}
question from:https://stackoverflow.com/questions/65917997/how-could-i-migrate-this-regex