Implémentation procédurale et rapide :
/// <summary>
/// Splits a code <paramref name="identifier"/> into the words it is made of.
/// </summary>
/// <remarks>
/// The current word is closed when a digit or lower-case letter is followed by an upper-case letter,
/// when an upper-case letter is followed by an upper-case letter that itself precedes a lower-case
/// letter (so "SimpleHTTPServer" gives "Simple", "HTTP", "Server"), when an upper-case letter is
/// followed by a separator or the end of the string, and at every character that is neither a letter
/// nor a digit. Words shorter than two characters are dropped.
/// </remarks>
/// <param name="identifier">The code identifier to extract words from; may be <c>null</c> or empty.</param>
/// <returns>The words found, in order of appearance; never <c>null</c>, possibly empty.</returns>
public static string[] GetWords(this string identifier) {
    Contract.Ensures(Contract.Result<string[]>() != null, "returned array of string is not null but can be empty");
    if (string.IsNullOrEmpty(identifier)) {
        return new string[0];
    }

    // One-letter and one-digit words are ignored.
    const int MinWordLength = 2;

    int count = identifier.Length;
    // One-character words are discarded, so there can never be more than 1 + count/2 words.
    var words = new List<string>(1 + count / 2);
    var pending = new StringBuilder();

    // Three-slot look-ahead window over the character kinds: current, next, afterNext.
    CharKind current = GetCharKind(identifier[0]); // the string is not empty here
    CharKind next = count == 1 ? CharKind.End : GetCharKind(identifier[1]);

    for (int pos = 0; pos < count; pos++) {
        CharKind afterNext = pos + 2 < count ? GetCharKind(identifier[pos + 2]) : CharKind.End;

        bool wordEnds;
        if (current == CharKind.Other) {
            // Separators are never part of a word; they only close the pending one.
            wordEnds = true;
        } else {
            pending.Append(identifier[pos]);
            if (current == CharKind.Digit || current == CharKind.LowerCaseLetter) {
                // A digit or lower-case letter closes the word only when an upper-case letter follows.
                wordEnds = next == CharKind.UpperCaseLetter;
            } else {
                // Inside the loop the current kind is never End, so only UpperCaseLetter remains.
                Debug.Assert(current == CharKind.UpperCaseLetter);
                if (next == CharKind.UpperCaseLetter) {
                    // "SimpleHTTPServer": at 'P' we must see that the char after 'S' is 'e'
                    // to know that "HTTP" ends here and 'S' starts the next word.
                    wordEnds = afterNext == CharKind.LowerCaseLetter;
                } else {
                    // Separator or end of string closes the word; a digit or lower-case letter continues it.
                    wordEnds = next == CharKind.End || next == CharKind.Other;
                }
            }
        }

        if (wordEnds) {
            if (pending.Length >= MinWordLength) {
                words.Add(pending.ToString());
            }
            pending.Length = 0;
        }

        // Slide the window one character to the right.
        current = next;
        next = afterNext;
    }

    // A trailing run of digits / lower-case letters is still pending when the loop ends.
    if (pending.Length >= MinWordLength) {
        words.Add(pending.ToString());
    }
    return words.ToArray();
}
/// <summary>
/// Classifies a single character for the purpose of splitting an identifier into words.
/// </summary>
/// <param name="c">The character to classify.</param>
/// <returns>
/// <see cref="CharKind.Digit"/> for digits, <see cref="CharKind.UpperCaseLetter"/> for upper-case letters,
/// <see cref="CharKind.LowerCaseLetter"/> for every other letter, and <see cref="CharKind.Other"/> for
/// anything else. <see cref="CharKind.End"/> is never returned.
/// </returns>
private static CharKind GetCharKind(char c) {
    if (char.IsDigit(c)) { return CharKind.Digit; }
    if (char.IsLetter(c)) {
        // Some letters are neither upper- nor lower-case (title-case, modifier and "other" letters,
        // e.g. CJK ideographs). They are deliberately treated like lower-case letters so that they
        // never start a new word on their own; asserting char.IsLower here would fail in Debug builds.
        return char.IsUpper(c) ? CharKind.UpperCaseLetter : CharKind.LowerCaseLetter;
    }
    return CharKind.Other;
}
/// <summary>
/// Kind of a character as seen by the identifier word splitter.
/// </summary>
enum CharKind {
    /// <summary>Marker used for look-ahead positions past the end of the string.</summary>
    End = 0,

    /// <summary>A digit.</summary>
    Digit = 1,

    /// <summary>An upper-case letter.</summary>
    UpperCaseLetter = 2,

    /// <summary>A letter that is not upper-case.</summary>
    LowerCaseLetter = 3,

    /// <summary>Anything that is neither a digit nor a letter; acts as a word separator.</summary>
    Other = 4
}
Tests:
// Checks GetWords against identifiers and their expected words.
// expectedWordsStr holds the expected words separated by single spaces; "" means "no word at all".
[TestCase((string)null, "")]
[TestCase("", "")]
// Ignore one letter or one digit words
[TestCase("A", "")]
[TestCase("4", "")]
[TestCase("_", "")]
[TestCase("a_b", "")]
[TestCase("Word_m_Field", "Word Field")]
[TestCase("Word_4_Field", "Word Field")]
[TestCase("a4", "a4")]
[TestCase("ABC", "ABC")]
[TestCase("abc", "abc")]
[TestCase("AbCd", "Ab Cd")]
[TestCase("AbcCde", "Abc Cde")]
[TestCase("ABCCde", "ABC Cde")]
[TestCase("Abc42Cde", "Abc42 Cde")]
[TestCase("Abc42cde", "Abc42cde")]
[TestCase("ABC42Cde", "ABC42 Cde")]
[TestCase("42ABC", "42 ABC")]
[TestCase("42abc", "42abc")]
[TestCase("abc_cde", "abc cde")]
[TestCase("Abc_Cde", "Abc Cde")]
[TestCase("_Abc__Cde_", "Abc Cde")]
[TestCase("ABC_CDE_FGH", "ABC CDE FGH")]
[TestCase("ABC CDE FGH", "ABC CDE FGH")] // Should not happen (white space): anything that is not a letter or a digit is considered a separator
[TestCase("ABC,CDE;FGH", "ABC CDE FGH")] // Should not happen (,;): anything that is not a letter or a digit is considered a separator
[TestCase("abc<cde", "abc cde")]
[TestCase("abc<>cde", "abc cde")]
[TestCase("abc<D>cde", "abc cde")] // Ignore one letter or one digit words
[TestCase("abc<Da>cde", "abc Da cde")]
[TestCase("abc<cde>", "abc cde")]
[TestCase("SimpleHTTPServer", "Simple HTTP Server")]
[TestCase("SimpleHTTPS2erver", "Simple HTTPS2erver")]
[TestCase("camelCase", "camel Case")]
[TestCase("m_Field", "Field")]
[TestCase("mm_Field", "mm Field")]
public void Test_GetWords(string identifier, string expectedWordsStr) {
    // RemoveEmptyEntries turns "" into an empty array, so "no words expected" works for any
    // identifier, not only for null or one-character ones.
    var expectedWords = expectedWordsStr.Split(new[] { ' ' }, System.StringSplitOptions.RemoveEmptyEntries);

    var words = identifier.GetWords();

    Assert.IsTrue(
        words.SequenceEqual(expectedWords),
        "Identifier '{0}': expected [{1}] but got [{2}]",
        identifier,
        string.Join(" ", expectedWords),
        string.Join(" ", words));
}