Detecting noun gender in the Russian language (Определение рода существительного в русском языке)

<?php
 
/* 
 * To detect noun gender, we shall look at it in nominative singular form.
 *
 * Those are the basic textbook rules:
 * - Nouns ending in consonants are always masculine. (good rule)
 * - Nouns ending in "-а", "-я", "-ия" are feminine (minor exceptions exist)
 * - Nouns ending in "-о", "-е", "-ие", "-[м]я" are neutral (good rule)
 * - Nouns ending in "-ь", could be EITHER masculine EITHER feminine (pain in the ass)
 * Hints:
 * - Nouns ending in "-арь", "-тель" relate to masculine gender.
 * - Nouns ending in hissing letters and in "-ь" relate to the feminine gender.
 *
 * Note: this function is written with masculine gender as 'default',
 *  trying to exclude other genders before returning. Therefore, some of
 *  the logic is based on that. That has no relation to russian grammar
 *  and could've been written the other way around.  
 */
function ru_noun_gender($word) {
	$word = mb_ereg_replace ( "~", "" , $word );
	$word = mb_strtolower($word, 'UTF-8');
	$wlen = mb_strlen($word);
 
	/* Masculine "exceptions" */
	static $m_list = array(
		/* Those nouns appear to be feminine, but are not due to physical
	 	* gender ('dad','uncle','young man','man','capitalist','male horse') */
		"папа","дядя","юноша","мужчина","буржуа","конь",
		/* Names of the month are always masculine */
		"февраль","апрель","июнь","июль","сентябрь","октябрь","ноябрь","декабрь",
		/* There are some more, TODO -- UPDATE */
		"день","пень","дурень"
		/* Some foreign nouns don't follow any rules and should be kept here: */ 
	);
 
	/* Feminine "exceptions" */
	static $f_list = array(
		/* Those words could be either masculine either feminine, and generally,
		 * the dictionary should be consulted. Since masculine is our default,
		 * we keep the list of feminine exceptions */
		 "дверь","тетрадь","грязь","тень","смерть","мать","жесть",
		 "моль","топь","соль","боль","роль"
		 /* There are millions more, TODO -- UPDATE */
		 /* Some foreign nouns don't follow any rules and should be kept here: */
	);
 
	/* Neuter "exceptions" */
	static $n_list = array(
		/* Like in some other languages, archaic 'child' is neuter */
		 "дитя",
		 /* There are probably others, TODO -- FIND THEM */
	);
 
	if (!in_array($word, $m_list)) //which are 'm'
	{
		$l1 = mb_substr($word, -1, $wlen, 'UTF-8'); // last letter
		$l2 = mb_substr($word, -2, $wlen, 'UTF-8'); // last 2 letters
 
		if ($l1 == "о" || $l1 == "е" || $l2 == "мя" || in_array($word, $n_list)) return 'n';
		else if ($l1 == "а" || $l1 == "я") return 'f';
		else if ($l1 == "ь") 
		{
			$l3 = mb_substr($word, -3, $wlen, 'UTF-8'); // last 3 letters
			$l4 = mb_substr($word, -4, $wlen, 'UTF-8'); // last 4 letters
 
			/* if the noun ends with suffix "-арь-" or "-тель-" */			
			if ($l3 != "арь" && $l4 != "тель") //which are 'm'
			{
				/* if the noun ends in hissing letters or with suffix "-ость-" */
				if ($l2 == "чь" || $l2 == "шь" || $l2 == "жь" || 
					$l4 == "ость" || in_array($word, $f_list)) return 'f';
				/* if the code ever reaches _this point_, there's a chance
				 * of incorrect detection */
			}
		}
	}	
	return 'm';
}
 
php?>
$words = array("creature"=>"существо", "woman"=>"женщина", "man"=>"мужчина");
foreach ($words as $eng=>$rus) {
	echo $eng . " - " . ru_noun_gender($rus) . " - " . $rus . " <BR>";
}
 
creature - n - существо 
woman - f - женщина
man - m - мужчина