diff --git a/classes/Encoding.php b/classes/Encoding.php new file mode 100644 index 0000000000..5c81a31f98 --- /dev/null +++ b/classes/Encoding.php @@ -0,0 +1,349 @@ + + * @package Encoding + * @version 2.0 + * @link https://github.com/neitanod/forceutf8 + * @example https://github.com/neitanod/forceutf8 + * @license Revised BSD + */ + +class Encoding { + + const ICONV_TRANSLIT = "TRANSLIT"; + const ICONV_IGNORE = "IGNORE"; + const WITHOUT_ICONV = ""; + + protected static $win1252ToUtf8 = array( + 128 => "\xe2\x82\xac", + + 130 => "\xe2\x80\x9a", + 131 => "\xc6\x92", + 132 => "\xe2\x80\x9e", + 133 => "\xe2\x80\xa6", + 134 => "\xe2\x80\xa0", + 135 => "\xe2\x80\xa1", + 136 => "\xcb\x86", + 137 => "\xe2\x80\xb0", + 138 => "\xc5\xa0", + 139 => "\xe2\x80\xb9", + 140 => "\xc5\x92", + + 142 => "\xc5\xbd", + + + 145 => "\xe2\x80\x98", + 146 => "\xe2\x80\x99", + 147 => "\xe2\x80\x9c", + 148 => "\xe2\x80\x9d", + 149 => "\xe2\x80\xa2", + 150 => "\xe2\x80\x93", + 151 => "\xe2\x80\x94", + 152 => "\xcb\x9c", + 153 => "\xe2\x84\xa2", + 154 => "\xc5\xa1", + 155 => "\xe2\x80\xba", + 156 => "\xc5\x93", + + 158 => "\xc5\xbe", + 159 => "\xc5\xb8" + ); + + protected static $brokenUtf8ToUtf8 = array( + "\xc2\x80" => "\xe2\x82\xac", + + "\xc2\x82" => "\xe2\x80\x9a", + "\xc2\x83" => "\xc6\x92", + "\xc2\x84" => "\xe2\x80\x9e", + "\xc2\x85" => "\xe2\x80\xa6", + "\xc2\x86" => "\xe2\x80\xa0", + "\xc2\x87" => "\xe2\x80\xa1", + "\xc2\x88" => "\xcb\x86", + "\xc2\x89" => "\xe2\x80\xb0", + "\xc2\x8a" => "\xc5\xa0", + "\xc2\x8b" => "\xe2\x80\xb9", + "\xc2\x8c" => "\xc5\x92", + + "\xc2\x8e" => "\xc5\xbd", + + + "\xc2\x91" => "\xe2\x80\x98", + "\xc2\x92" => "\xe2\x80\x99", + "\xc2\x93" => "\xe2\x80\x9c", + "\xc2\x94" => "\xe2\x80\x9d", + "\xc2\x95" => "\xe2\x80\xa2", + "\xc2\x96" => "\xe2\x80\x93", + "\xc2\x97" => "\xe2\x80\x94", + "\xc2\x98" => "\xcb\x9c", + "\xc2\x99" => "\xe2\x84\xa2", + "\xc2\x9a" => "\xc5\xa1", + "\xc2\x9b" => "\xe2\x80\xba", + "\xc2\x9c" => "\xc5\x93", + + "\xc2\x9e" => "\xc5\xbe", + "\xc2\x9f" => "\xc5\xb8" + ); + + protected static $utf8ToWin1252 = array( + "\xe2\x82\xac" => "\x80", + + "\xe2\x80\x9a" => "\x82", + "\xc6\x92" => "\x83", + "\xe2\x80\x9e" => "\x84", + "\xe2\x80\xa6" => "\x85", + "\xe2\x80\xa0" => "\x86", + "\xe2\x80\xa1" => "\x87", + "\xcb\x86" => "\x88", + "\xe2\x80\xb0" => "\x89", + "\xc5\xa0" => "\x8a", + "\xe2\x80\xb9" => "\x8b", + "\xc5\x92" => "\x8c", + + "\xc5\xbd" => "\x8e", + + + "\xe2\x80\x98" => "\x91", + "\xe2\x80\x99" => "\x92", + "\xe2\x80\x9c" => "\x93", + "\xe2\x80\x9d" => "\x94", + "\xe2\x80\xa2" => "\x95", + "\xe2\x80\x93" => "\x96", + "\xe2\x80\x94" => "\x97", + "\xcb\x9c" => "\x98", + "\xe2\x84\xa2" => "\x99", + "\xc5\xa1" => "\x9a", + "\xe2\x80\xba" => "\x9b", + "\xc5\x93" => "\x9c", + + "\xc5\xbe" => "\x9e", + "\xc5\xb8" => "\x9f" + ); + + static function toUTF8($text){ + /** + * Function \ForceUTF8\Encoding::toUTF8 + * + * This function leaves UTF8 characters alone, while converting almost all non-UTF8 to UTF8. + * + * It assumes that the encoding of the original string is either Windows-1252 or ISO 8859-1. + * + * It may fail to convert characters to UTF-8 if they fall into one of these scenarios: + * + * 1) when any of these characters: ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß + * are followed by any of these: ("group B") + * ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶•¸¹º»¼½¾¿ + * For example: %ABREPRESENT%C9%BB. «REPRESENTÉ» + * The "«" (%AB) character will be converted, but the "É" followed by "»" (%C9%BB) + * is also a valid unicode character, and will be left unchanged. + * + * 2) when any of these: àáâãäåæçèéêëìíîï are followed by TWO chars from group B, + * 3) when any of these: ðñòó are followed by THREE chars from group B. + * + * @name toUTF8 + * @param string $text Any string. + * @return string The same string, UTF8 encoded + * + */ + + if(is_array($text)) + { + foreach($text as $k => $v) + { + $text[$k] = self::toUTF8($v); + } + return $text; + } + + if(!is_string($text)) { + return $text; + } + + $max = self::strlen($text); + + $buf = ""; + for($i = 0; $i < $max; $i++){ + $c1 = $text[$i]; + if($c1>="\xc0"){ //Should be converted to UTF8, if it's not UTF8 already + $c2 = $i+1 >= $max? "\x00" : $text[$i+1]; + $c3 = $i+2 >= $max? "\x00" : $text[$i+2]; + $c4 = $i+3 >= $max? "\x00" : $text[$i+3]; + if($c1 >= "\xc0" & $c1 <= "\xdf"){ //looks like 2 bytes UTF8 + if($c2 >= "\x80" && $c2 <= "\xbf"){ //yeah, almost sure it's UTF8 already + $buf .= $c1 . $c2; + $i++; + } else { //not valid UTF8. Convert it. + $cc1 = (chr(ord($c1) / 64) | "\xc0"); + $cc2 = ($c1 & "\x3f") | "\x80"; + $buf .= $cc1 . $cc2; + } + } elseif($c1 >= "\xe0" & $c1 <= "\xef"){ //looks like 3 bytes UTF8 + if($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf"){ //yeah, almost sure it's UTF8 already + $buf .= $c1 . $c2 . $c3; + $i = $i + 2; + } else { //not valid UTF8. Convert it. + $cc1 = (chr(ord($c1) / 64) | "\xc0"); + $cc2 = ($c1 & "\x3f") | "\x80"; + $buf .= $cc1 . $cc2; + } + } elseif($c1 >= "\xf0" & $c1 <= "\xf7"){ //looks like 4 bytes UTF8 + if($c2 >= "\x80" && $c2 <= "\xbf" && $c3 >= "\x80" && $c3 <= "\xbf" && $c4 >= "\x80" && $c4 <= "\xbf"){ //yeah, almost sure it's UTF8 already + $buf .= $c1 . $c2 . $c3 . $c4; + $i = $i + 3; + } else { //not valid UTF8. Convert it. + $cc1 = (chr(ord($c1) / 64) | "\xc0"); + $cc2 = ($c1 & "\x3f") | "\x80"; + $buf .= $cc1 . $cc2; + } + } else { //doesn't look like UTF8, but should be converted + $cc1 = (chr(ord($c1) / 64) | "\xc0"); + $cc2 = (($c1 & "\x3f") | "\x80"); + $buf .= $cc1 . $cc2; + } + } elseif(($c1 & "\xc0") === "\x80"){ // needs conversion + if(isset(self::$win1252ToUtf8[ord($c1)])) { //found in Windows-1252 special cases + $buf .= self::$win1252ToUtf8[ord($c1)]; + } else { + $cc1 = (chr(ord($c1) / 64) | "\xc0"); + $cc2 = (($c1 & "\x3f") | "\x80"); + $buf .= $cc1 . $cc2; + } + } else { // it doesn't need conversion + $buf .= $c1; + } + } + return $buf; + } + + static function toWin1252($text, $option = self::WITHOUT_ICONV) { + if(is_array($text)) { + foreach($text as $k => $v) { + $text[$k] = self::toWin1252($v, $option); + } + return $text; + } elseif(is_string($text)) { + return static::utf8_decode($text, $option); + } else { + return $text; + } + } + + static function toISO8859($text, $option = self::WITHOUT_ICONV) { + return self::toWin1252($text, $option); + } + + static function toLatin1($text, $option = self::WITHOUT_ICONV) { + return self::toWin1252($text, $option); + } + + static function fixUTF8($text, $option = self::WITHOUT_ICONV){ + if(is_array($text)) { + foreach($text as $k => $v) { + $text[$k] = self::fixUTF8($v, $option); + } + return $text; + } + + if(!is_string($text)) { + return $text; + } + + $last = ""; + while($last <> $text){ + $last = $text; + $text = self::toUTF8(static::utf8_decode($text, $option)); + } + $text = self::toUTF8(static::utf8_decode($text, $option)); + return $text; + } + + static function UTF8FixWin1252Chars($text){ + // If you received an UTF-8 string that was converted from Windows-1252 as it was ISO8859-1 + // (ignoring Windows-1252 chars from 80 to 9F) use this function to fix it. + // See: http://en.wikipedia.org/wiki/Windows-1252 + + return str_replace(array_keys(self::$brokenUtf8ToUtf8), array_values(self::$brokenUtf8ToUtf8), $text); + } + + static function removeBOM($str=""){ + if(substr($str, 0,3) === pack("CCC",0xef,0xbb,0xbf)) { + $str=substr($str, 3); + } + return $str; + } + + protected static function strlen($text){ + return (function_exists('mb_strlen') && ((int) ini_get('mbstring.func_overload')) & 2) ? + mb_strlen($text,'8bit') : strlen($text); + } + + public static function normalizeEncoding($encodingLabel) + { + $encoding = strtoupper($encodingLabel); + $encoding = preg_replace('/[^a-zA-Z0-9\s]/', '', $encoding); + $equivalences = array( + 'ISO88591' => 'ISO-8859-1', + 'ISO8859' => 'ISO-8859-1', + 'ISO' => 'ISO-8859-1', + 'LATIN1' => 'ISO-8859-1', + 'LATIN' => 'ISO-8859-1', + 'UTF8' => 'UTF-8', + 'UTF' => 'UTF-8', + 'WIN1252' => 'ISO-8859-1', + 'WINDOWS1252' => 'ISO-8859-1' + ); + + if(empty($equivalences[$encoding])){ + return 'UTF-8'; + } + + return $equivalences[$encoding]; + } + + public static function encode($encodingLabel, $text) + { + $encodingLabel = self::normalizeEncoding($encodingLabel); + if($encodingLabel === 'ISO-8859-1') return self::toLatin1($text); + return self::toUTF8($text); + } + + protected static function utf8_decode($text, $option = self::WITHOUT_ICONV) + { + if ($option == self::WITHOUT_ICONV || !function_exists('iconv')) { + $o = utf8_decode( + str_replace(array_keys(self::$utf8ToWin1252), array_values(self::$utf8ToWin1252), self::toUTF8($text)) + ); + } else { + $o = iconv("UTF-8", "Windows-1252" . ($option === self::ICONV_TRANSLIT ? '//TRANSLIT' : ($option === self::ICONV_IGNORE ? '//IGNORE' : '')), $text); + } + return $o; + } +} diff --git a/classes/ImageShared.php b/classes/ImageShared.php index fe475d8c48..8523d6b6ec 100644 --- a/classes/ImageShared.php +++ b/classes/ImageShared.php @@ -1122,10 +1122,13 @@ private static function getImgDim2($imgUrl) { $data = curl_exec($curl); curl_close($curl); $width = 0; $height = 0; + $im = @imagecreatefromstring($data); - $width = @imagesx($im); - $height = @imagesy($im); - if($im) imagedestroy($im); + if($im){ + $width = @imagesx($im); + $height = @imagesy($im); + imagedestroy($im); + } if(!$width || !$height) return false; return array($width,$height); } diff --git a/classes/SpecProcessorOcr.php b/classes/SpecProcessorOcr.php index 6bb7be65ff..e4295466f6 100644 --- a/classes/SpecProcessorOcr.php +++ b/classes/SpecProcessorOcr.php @@ -4,6 +4,7 @@ */ include_once($SERVER_ROOT.'/config/dbconnection.php'); include_once($SERVER_ROOT.'/classes/Manager.php'); +include_once($SERVER_ROOT.'/classes/Encoding.php'); class SpecProcessorOcr extends Manager{ @@ -768,7 +769,7 @@ private function setTempPath(){ $tempPath = ini_get('upload_tmp_dir'); } if(!$tempPath){ - $tempPath = $GLOBALS['serverRoot']; + $tempPath = $GLOBALS['SERVER_ROOT']; if(substr($tempPath,-1) != '/') $tempPath .= '/'; $tempPath .= 'temp/'; } @@ -786,38 +787,19 @@ private function setTempPath(){ //Misc functions private function cleanRawStr($inStr){ - $retStr = $this->encodeString($inStr); + $retStr = trim($inStr); + //$retStr = $this->encodeString($retStr); + $retStr = Encoding::toUTF8($retStr); + + $retStr = preg_replace("/(^[\r\n]*|[\r\n]+)[\s\t]*[\r\n]+[\s\t]*[\r\n]+/", "\n\n", $retStr); //replace commonly misinterpreted characters $replacements = array("/\." => "A.", "/-\\" => "A", "\X/" => "W", "\Y/" => "W", "`\‘i/" => "W", chr(96) => "'", chr(145) => "'", chr(146) => "'", "�" => "'", "�" => '"', "�" => '"', "�" => '"', chr(147) => '"', chr(148) => '"', chr(152) => '"', chr(239) => "�"); $retStr = str_replace(array_keys($replacements), $replacements, $retStr); - //replace Is, ls and |s in latitudes and longitudes with ones - //replace Os in latitudes and longitudes with zeroes, Ss with 5s and Zs with 2s - //latitudes and longitudes can be of the types: ddd.ddddddd�, ddd� ddd.ddd' or ddd� ddd' ddd.ddd" - $false_num_class = "[OSZl|I!\d]";//the regex class that represents numbers and characters that numbers are commonly replaced with - $preg_replace_callback_pattern = - array( - "/".$false_num_class."{1,3}(\.".$false_num_class."{1,7})\s?".chr(176)."\s?[NSEW(\\\V)(\\\W)]/", - "/".$false_num_class."{1,3}".chr(176)."\s?".$false_num_class."{1,3}(\.".$false_num_class."{1,3})?\s?'\s?[NSEW(\\\V)(\\\W)]/", - "/".$false_num_class."{1,3}".chr(176)."\s?".$false_num_class."{1,3}\s?'\s?(".$false_num_class."{1,3}(\.".$false_num_class."{1,3})?\"\s?)?[NSEW(\\\V)(\\\W)]/" - ); - $retStr = preg_replace_callback($preg_replace_callback_pattern, create_function('$matches','return str_replace(array("l","|","!","I","O","S","Z"), array("1","1","1","1","0","5","2"), $matches[0]);'), $retStr); //replace \V and \W in longitudes and latitudes with W $retStr = preg_replace("/(\d\s?[".chr(176)."'\"])\s?\\\[VW]/", "\${1}W", $retStr, -1); - //replace Zs and zs with 2s, Is, !s, |s and ls with 1s and Os and os with 0s in dates of type Mon(th) DD, YYYY - $retStr = preg_replace_callback( - "/(((?i)January|Jan\.?|February|Feb\.?|March|Mar\.?|April|Apr\.?|May|June|Jun\.?|July|Jul\.?|August|Aug\.?|September|Sept?\.?|October|Oct\.?|November|Nov\.?|December|Dec\.?)\s)(([\dOIl|!ozZS]{1,2}),?\s)([\dOI|!lozZS]{4})/", - create_function('$matches','return $matches[1].str_replace(array("l","|","!","I","O","o","Z","z","S"), array("1","1","1","1","0","0","2","2","5"), $matches[3]).str_replace(array("l","|","!","I","O","o","Z","z","S"), array("1","1","1","1","0","0","2","2","5"), $matches[5]);'), - $retStr - ); - //replace Zs with 2s, Is with 1s and Os with 0s in dates of type DD-Mon(th)-YYYY or DDMon(th)YYYY or DD Mon(th) YYYY - $retStr = preg_replace_callback( - "/([\dOIl!|ozZS]{1,2}[-\s]?)(((?i)January|Jan\.?|February|Feb\.?|March|Mar\.?|April|Apr\.?|May|June|Jun\.?|July|Jul\.?|August|Aug\.?|September|Sept?\.?|October|Oct\.?|November|Nov\.?|December|Dec\.?)[-\s]?)([\dOIl|!ozZS]{4})/i", - create_function('$matches','return str_replace(array("l","|","!","I","O","o","Z","z","S"), array("1","1","1","1","0","0","2","2","5"), $matches[1]).$matches[2].str_replace(array("l","|","!","I","O","o","Z","z","S"), array("1","1","1","1","0","0","2","2","5"), $matches[4]);'), - $retStr - ); return $retStr; } } diff --git a/classes/TaxonProfile.php b/classes/TaxonProfile.php index 1e53903a0a..bfb39ea1d1 100644 --- a/classes/TaxonProfile.php +++ b/classes/TaxonProfile.php @@ -687,7 +687,10 @@ public function getOccTaxonInDbCnt($limitRank = 170, $collidStr = 'all') { $count = -1; if ($this->rankId >= $limitRank) { - $sql = 'SELECT COUNT(o.occid) as cnt FROM omoccurrences o JOIN (SELECT DISTINCT e.tid, t.sciname FROM taxaenumtree e JOIN taxa t ON e.tid = t.tid WHERE parenttid = '.$this->tid.' OR e.tid = '.$this->tid.') AS parentAndChildren ON o.tidinterpreted = parentAndChildren.tid '; + //$sql = 'SELECT COUNT(o.occid) as cnt FROM omoccurrences o JOIN (SELECT DISTINCT e.tid, t.sciname FROM taxaenumtree e JOIN taxa t ON e.tid = t.tid WHERE parenttid = '.$this->tid.' OR e.tid = '.$this->tid.') AS parentAndChildren ON o.tidinterpreted = parentAndChildren.tid '; + $sql = 'SELECT COUNT(o.occid) as cnt + FROM omoccurrences o JOIN (SELECT DISTINCT ts.tid FROM taxaenumtree e JOIN taxa t ON e.tid = t.tid INNER JOIN taxstatus ts ON e.tid = ts.tidaccepted + WHERE e.parenttid = '.$this->tid.' OR e.tid = '.$this->tid.') AS taxa ON o.tidinterpreted = taxa.tid '; if (preg_match('/^[,\d]+$/',$collidStr)) $sql .= 'AND o.collid IN('.$collidStr.')'; $result = $this->conn->query($sql); while ($row = $result->fetch_object()){ diff --git a/classes/TaxonomyHarvester.php b/classes/TaxonomyHarvester.php index e1fbf75e21..f388d9d24c 100644 --- a/classes/TaxonomyHarvester.php +++ b/classes/TaxonomyHarvester.php @@ -96,6 +96,10 @@ private function addSciname($taxonArr, $resourceKey){ $this->logOrEcho('Checking TROPICOS...',1); $newTid= $this->addTropicosTaxon($taxonArr); } + elseif($resourceKey== 'fdex'){ + $this->logOrEcho('Checking fdex...',1); + $newTid= $this->addFdexTaxon($taxonArr); + } elseif($resourceKey== 'eol'){ $this->logOrEcho('Checking EOL...',1); $newTid= $this->addEolTaxon($taxonArr); @@ -735,25 +739,48 @@ private function getTropicosNode($nodeArr){ return $taxonArr; } - //Index Fungorum functions + //Index Fungorum functions via MyCoPortal FdEx tools //http://www.indexfungorum.org/ixfwebservice/fungus.asmx/NameSearch?SearchText=Acarospora%20socialis&AnywhereInText=false&MaxNumber=10 - private function addIndexFungorumTaxon($taxonArr){ + private function addFdexTaxon($taxonArr){ $sciName = $taxonArr['sciname']; if($sciName){ $adjustedName = $sciName; if(isset($taxonArr['rankid']) && $taxonArr['rankid'] > 220) $adjustedName = trim($taxonArr['unitname1'].' '.$taxonArr['unitname2'].' '.$taxonArr['unitname3']); - $url = 'https://webservice.catalogueoflife.org/col/webservice?response=full&format=json&name='.str_replace(' ','%20',$adjustedName); + $url = 'https://mycoportal.org/fdex/services/api/query.php?qText='.str_replace(' ','%20',$adjustedName).'&qField=taxon'; //echo $url.'
'; $retArr = $this->getContentString($url); $content = $retArr['str']; - $resultArr = json_decode($content,true); - $numResults = $resultArr['number_of_results_returned']; - if($numResults){ - + if($content == '0 results'){ + $this->logOrEcho('Taxon not found',2); + return false; + } + else{ + $resultArr = json_decode($content,true); + $numResults = count($resultArr); + $taxonArr = array(); + if($numResults){ + /* + * return example "taxon" : "Verrucaria microstictica" , "authors" : "Leight." , "mbNumber" : "307221" , "otherID" : "86A6E1F9-AACE-43AF-A466-9427B38788D4" , + * "rank" : "sp." , "rankCode" : "20" , "taxonomicStatus" : "Assumed legitimate" , "currentTaxon" : "Polycoccum microsticticum" , "currentMbNumber" : "307214" , + * "currentOtherID" : "CACE62EC-E136-44D9-B01C-BB36D95E6262" , "currentStatus" : "Stable" , "parentTaxon" : "Verrucaria" , "parentMbNumber" : "5725" , + * "parentOtherID" : "1CB1CC6A-36B9-11D5-9548-00D0592D548C" , "taxonomicAgreement" : "Asynchronous", "recordSource" : "Index Fungorum" + */ + foreach($resultArr as $unitArr){ + $taxonArr['sciname'] = $unitArr['taxon']; + $rankArr = $this->getFdexRank($unitArr['rank'],$unitArr['rankCode']); + } + } + $this->loadNewTaxon($taxonArr); } } } + private function getFdexRank($rankStr, $rankCode){ + $retArr = array(); + + return $retArr; + } + //EOL functions private function addEolTaxon($taxonArr){ //Returns content for accepted name diff --git a/classes/WordCloud.php b/classes/WordCloud.php index f92e77aa0e..1f7018593e 100644 --- a/classes/WordCloud.php +++ b/classes/WordCloud.php @@ -17,12 +17,12 @@ class WordCloud{ private $supportUtf8 = true; public function __construct(){ - $this->conn = MySQLiConnectionFactory::getCon("readonly"); + $this->conn = MySQLiConnectionFactory::getCon('readonly'); - $this->displayedWordCount = 100; + $this->displayedWordCount = 150; if($GLOBALS['charset'] == 'ISO-8859-1') $this->supportUtf8 = false; - $this->tagUrl = "http://www.google.com/search?hl=en&q="; - + //$this->tagUrl = "https://www.google.com/search?hl=en&q="; + $this->tagUrl = $GLOBALS['CLIENT_ROOT'].'/collections/editor/occurrencetabledisplay.php?occindex=0&reset=1&q_processingstatus=unprocessed'; $this->backgroundColor = "#000"; $this->wordColors[0] = "#5122CC"; $this->wordColors[1] = "#229926"; @@ -35,104 +35,116 @@ public function __construct(){ $this->wordColors[8] = "#229938"; $this->wordColors[9] = "#419922"; - $commonWordStr = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,arent," . - "as,at,be,because,been,but,by,can,cant,cannot,could,couldve,couldnt,dear,did,didnt,do,does,doesnt," . - "dont,either,else,ever,every,for,from,get,got,had,has,hasnt,have,he,her,him,his,how,however," . - "i,if,in,into,is,isnt,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off," . - "often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that," . - "the,their,them,then,there,theres,these,they,this,to,too,us,wants,was,wasnt,we,were,werent,what," . - "when,when,where,which,while,who,whom,why,will,with,wont,would,wouldve,wouldnt,yet,you,your"; - //$commonWordStr = strtolower($commonWordStr); - $this->commonWordArr = explode(",", $commonWordStr); + $commonWordStr = 'a,able,about,across,after,all,almost,also,am,among,an,and,any,are,arent,as,at,be,because,been,but,by,can,cant,cannot,could,couldve,couldnt,dear,did,didnt,do,'. + 'does,doesnt,dont,either,else,ever,every,for,from,get,got,had,has,hasnt,have,he,her,him,his,how,however,i,if,in,into,is,isnt,it,its,just,least,let,like,likely,may,me,might,'. + 'most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,theres,these,they,this.'. + 'to,too,us,wants,was,wasnt,we,were,werent,what,when,when,where,which,while,who,whom,why,will,with,wont,would,wouldve,wouldnt,yet,you,your'; + $this->commonWordArr = explode(',', $commonWordStr); } public function __destruct(){ if(!($this->conn === null)) $this->conn->close(); } - public function buildWordFile($collectionId = 0,$csMode = 0){ - $collArr = array(); - $sqlFrag = 'FROM omoccurrences o INNER JOIN images i ON o.occid = i.occid '. - 'INNER JOIN specprocessorrawlabels r ON i.imgid = r.imgid '; - if($csMode){ - $sqlFrag .= 'INNER JOIN omcrowdsourcequeue q ON o.occid = q.occid '; + public function batchBuildWordClouds($csMode = 0){ + $processingArr = array(); + $sql = 'SELECT DISTINCT c.collid FROM omoccurrences o INNER JOIN images i ON o.occid = i.occid INNER JOIN specprocessorrawlabels r ON i.imgid = r.imgid '; + if($csMode) $sql .= 'INNER JOIN omcrowdsourcequeue q ON o.occid = q.occid '; + $sql .= 'WHERE o.processingstatus = "unprocessed" AND o.locality IS NULL '; + $rs = $this->conn->query($sql); + while($r = $rs->fetch_object()){ + $processingArr[] = $r->collid; } - $sqlColl = 'SELECT DISTINCT c.collid, c.collectionname '.$sqlFrag. - 'INNER JOIN omcollections c ON c.collid = o.collid '; - if($collectionId){ - $sqlColl .= 'WHERE c.collid = '.$collectionId; + $rs->free(); + foreach($processingArr as $collid){ + $this->buildWordCloud($collid, $csMode); } - //echo 'sql: '.$sqlColl; - $rsColl = $this->conn->query($sqlColl); - while($rColl = $rsColl->fetch_object()){ - $collArr[$rColl->collid] = $rColl->collectionname; + } + + public function buildWordCloud($collid, $csMode = 0){ + $retPath = ''; + //Reset frequency array + unset($this->frequencyArr); + $this->frequencyArr = array(); + $this->tagUrl .= '&collid='.$collid.'&q_customfield1=ocrFragment&q_customtype1=LIKE&q_customvalue1='; + $sql = 'SELECT DISTINCT r.rawstr FROM omoccurrences o INNER JOIN images i ON o.occid = i.occid INNER JOIN specprocessorrawlabels r ON i.imgid = r.imgid '; + if($csMode) $sql .= 'INNER JOIN omcrowdsourcequeue q ON o.occid = q.occid '; + $sql .= 'WHERE o.processingstatus = "unprocessed" AND o.locality IS NULL '; + if($collid) $sql .= 'AND o.collid = '.$collid; + //echo $sql; exit; + //Process all raw OCR strings for collection + $rs = $this->conn->query($sql); + while($r = $rs->fetch_object()){ + $this->addTagsFromText($r->rawstr); } - $rsColl->free(); - - $sql = 'SELECT DISTINCT r.rawstr '.$sqlFrag. - 'WHERE o.processingstatus = "unprocessed" AND o.locality IS NULL '; - foreach($collArr as $collid => $collName){ - //Reset frequency array - unset($this->frequencyArr); - $this->frequencyArr = array(); - //Process all raw OCR strings for collection - $sql .= 'AND o.collid = '.$collid; - $rs = $this->conn->query($sql); - while($r = $rs->fetch_object()){ - $this->addTagsFromText($r->rawstr); - } - $rs->free(); - //Get Word cloud - $cloudStr = $this->getWordCloud(); - echo $cloudStr.'

'; - //Write word out to text file - $wcPath = $GLOBALS['serverRoot']; - if(substr($wcPath,-1) != '/' && substr($wcPath,-1) != "\\") $wcPath .= '/'; - $wcPath .= 'content/collections/wordclouds/ocrcloud'.$collid.'.html'; - if(file_exists($wcPath)){ - $wcFH = fopen($wcPath, 'a'); - if(!$wcFH = fopen($wcPath, 'a')) { - echo "Cannot open file ($wcPath)"; - exit; - } - if(fwrite($wcFH, $cloudStr) === FALSE) { - echo "Cannot write to file ($wcPath)"; - exit; - } - fclose($handle); - } - else{ - echo 'ERROR trying to write word cloud to temp folder: '.$wcPath; - echo '
Is the symbiota temp folder writable to Apache?'; + $rs->free(); + //Get Word cloud + $cloudStr = $this->getWordCloud(); + if(!$cloudStr){ + echo '
No phrases created
sql: '.$sql.'
'; + exit; + } + //echo $cloudStr.'

'; + //$cloudHtml = $this->getCloudHtmlWrapper($cloudStr); + $serverRoot = $GLOBALS['SERVER_ROOT']; + if(substr($serverRoot,-1) != '/') $serverRoot .= '/'; + $wcPath = 'content/collections/wordclouds/ocrcloud_'.$collid.($csMode?'_cs':'').'.php'; + if($wcFH = fopen($serverRoot.$wcPath, 'w')){ + if(fwrite($wcFH, $cloudStr) === FALSE) { + echo 'Cannot write to file ('.$wcPath.')'; + exit; } + fclose($wcFH); + $clientRoot = $GLOBALS['CLIENT_ROOT']; + if(substr($clientRoot,-1) != '/') $clientRoot .= '/'; + $retPath = $clientRoot.$wcPath; + } + else{ + echo 'Cannot open file for writing ('.$wcPath.')'; + exit; } + return $retPath; } - public function addTagsFromText($seedText){ + private function addTagsFromText($seedText){ //$text = strtolower($seedText); //$text = strip_tags($text); - /* remove punctuation and newlines */ - if ($this->supportUtf8){ - $seedText = preg_replace('/[^\p{L}0-9\s]|\n|\r/u',' ',$seedText); - } - else{ - $seedText = preg_replace('/[^a-zA-Z0-9\s]|\n|\r/',' ',$seedText); - } + $seedText = preg_replace('/[;,\r\t]/',"\n",$seedText); + //$seedText = preg_replace('/[^\p{L}0-9\s.-]|\n|\r/u',' ',$seedText); /* remove extra spaces created */ - $seedText = preg_replace('/\s+/',' ',$seedText); - $seedText = trim($seedText); + $seedText = preg_replace('/\s+/',' ',trim($seedText)); - //Remove common words - $wordArr = array_diff(explode(" ", $seedText),$this->commonWordArr); + //$wordArr = array_diff(explode(" ", $seedText),$this->commonWordArr); + $phraseArr = explode("\n", $seedText); - foreach ($wordArr as $key => $value){ - $this->addTag($value); + foreach($phraseArr as $phrase){ + $tag = ''; + $wordCnt = 0; + foreach(explode(' ',$phrase) as $word){ + if($this->keepWord($word) && $wordCnt < 3){ + $tag .= $word.' '; + $wordCnt++; + } + elseif($tag){ + if($wordCnt > 1) $this->addTag(trim($tag,' .')); + $tag = ''; + $wordCnt = 0; + } + } + //if(strlen($value) > 3 && !is_numeric($value)) $this->addTag($value); } } - public function addTag($tag, $useCount = 1){ + private function keepWord($word){ + if(strlen($word) < 3) return false; + if(!preg_match('/^[A-Z]{1}[A-Za-z.]+/',$word)) return false; + if(in_array(strtolower($word),$this->commonWordArr)) return false; + return true; + } + + private function addTag($tag, $useCount = 1){ //$tag = strtolower($tag); if (array_key_exists($tag, $this->frequencyArr)){ $this->frequencyArr[$tag] += $useCount; @@ -142,32 +154,55 @@ public function addTag($tag, $useCount = 1){ } } - public function getWordCloud(){ - $retStr = '
cloudWidth. ";") : "") . - 'line-height:normal">
backgroundImage ."');") : "") . - 'border-color:#888;margin-top:20px;margin-bottom:10px;padding:5px 5px 20px 5px;background-color:'.$this->backgroundColor.';">'; + private function getWordCloud(){ + $retStr = ''; if($this->frequencyArr){ + $retStr = '
cloudWidth. ";") : "") . + 'line-height:normal">
backgroundImage ."');") : "") . + 'border-color:#888;margin-top:20px;margin-bottom:10px;padding:5px 5px 20px 5px;background-color:'.$this->backgroundColor.';">'; arsort($this->frequencyArr); $topTags = array_slice($this->frequencyArr, 0, $this->displayedWordCount); /* randomize the order of elements */ - uasort($topTags, 'randomSort'); + uasort($topTags, function ($a, $b){ return rand(-1, 1); }); $maxCount = max($this->frequencyArr); foreach ($topTags as $tag => $useCount){ $grade = $this->gradeFrequency(($useCount * 100) / $maxCount); - $retStr .= (''. + $retStr .= (''. ''.$tag.' '); } - $retStr .= '

'; + $retStr .= '
'; + $retStr .= '
Created on '.date('Y-m-d H:i:s').'
'; + $retStr .= '
'; } return $retStr; } + private function getCloudHtmlWrapper($cloudStr){ + $htmlStr = ' + + + '.$GLOBALS['DEFAULT_TITLE'].' - Word Cloud '; + $htmlStr .= ''; + $htmlStr .= ' + + + +
'; + $htmlStr .= $cloudStr; + $htmlStr .= '
+ +'; + return $htmlStr; + } + private function gradeFrequency($frequency){ $grade = 0; if ($frequency >= 90) @@ -223,10 +258,4 @@ public function setTextColors($colors){ } } } - -/* array sort helper function */ -function randomSort($a, $b) -{ - return rand(-1, 1); -} -?> +?> \ No newline at end of file diff --git a/collections/specprocessor/wordcloudhandler.php b/collections/specprocessor/wordcloudhandler.php index 055e661829..4f97398ae2 100644 --- a/collections/specprocessor/wordcloudhandler.php +++ b/collections/specprocessor/wordcloudhandler.php @@ -3,11 +3,8 @@ include_once($SERVER_ROOT.'/classes/WordCloud.php'); header("Content-Type: text/html; charset=".$CHARSET); -$collTarget = array_key_exists("colltarget",$_REQUEST)?$_REQUEST["colltarget"]:5; - -$cloudHandler = new WordCloud(); -$cloudHandler->setWidth(800); -$cloudHandler->buildWordFile($collTarget); +$collidStr = array_key_exists('collidstr',$_REQUEST)?$_REQUEST['collidstr']:false; +$csMode = array_key_exists('csmode',$_REQUEST)?$_REQUEST['csmode']:false; ?> @@ -16,26 +13,28 @@ <?php echo $DEFAULT_TITLE; ?> - Word Cloud Handler Collections '; - echo ''; - echo ''; - } + include_once($SERVER_ROOT.'/includes/head.php'); ?> +
Cloud'.$collTarget.''; + $cloudHandler = new WordCloud(); + $cloudHandler->setWidth(800); + if(is_numeric($collidStr)){ + $collidArr = explode(',',$collidStr); + foreach($collidArr as $collid){ + $url = $cloudHandler->buildWordCloud($collid,$csMode); + echo '
'.$url.'
'; + } + } + else echo '
No collid target submitted
'; ?>