/**
 * Flags a Person node in the graph with `fromMU` and `isProfessor` properties.
 *
 * The name is split into first/last parts with HumanNameParser_Parser and
 * looked up in the local MOSpacePeople table. Unless exactly one local row
 * matches, the PeopleFinder web service is queried instead. The resulting 0/1
 * flags are written to the Person node whose `name` equals $unifiedName.
 *
 * Terminates the script via die() when the database connection, the lookup
 * query, or the row fetch fails.
 *
 * @param string $unifiedName Full name; used verbatim as the node's `name`.
 * @param mixed  $client      Graph client; must provide executeCypherQuery().
 * @return void
 */
function addMOProperty($unifiedName, $client)
{
    $mysqli = new mysqli(HOSTNAME, USERNAME, PASSWD, DATABASE);
    if ($mysqli->connect_errno) {
        die("error: " . $mysqli->connect_error);
    }

    $localTable = "MOSpacePeople";
    $parser = new HumanNameParser_Parser($unifiedName);
    $last = $parser->getLast();
    $first = $parser->getFirst();

    $q = "SELECT * from " . $localTable
        . " where firstname='" . $mysqli->real_escape_string($first)
        . "' and lastname='" . $mysqli->real_escape_string($last) . "'";

    $fromMU = 0;
    $isProfessor = 0;

    $result = $mysqli->query($q);
    if (!$result) {
        die("query: " . $q . "\nFailed");
    }

    // Only an unambiguous (exactly one row) local match is trusted.
    $foundLocally = ($result->num_rows == 1);
    $localRow = $foundLocally ? $result->fetch_assoc() : null;

    // The database is not needed past this point; release it before the
    // (potentially slow) remote lookup below.
    $result->free();
    $mysqli->close();

    if ($foundLocally) {
        if (!$localRow) {
            die("fetch result from MOSpacePeople failed");
        }
        $fromMU = 1;
        if (isset($localRow['Title'])) {
            $isProfessor = findIfProfessor($localRow['Title']) == 1 ? 1 : 0;
        }
    } else {
        $peopleFinderURL = "https://webservices.doit.missouri.edu/peoplefinderWS/peoplefinderws.asmx/PeopleFinderXml?firstName="
            . urlencode($first) . "&lastname=" . urlencode($last) . "&department=&phoneno=&email=";
        $url_parser = new URLParser($peopleFinderURL);
        $retArr = $url_parser->XMLToArray();

        // Guard the response shape: a failed or unexpected reply is treated
        // as "not found" instead of raising on a missing index.
        $found = is_array($retArr) && isset($retArr['@attributes']['found'])
            ? intval($retArr['@attributes']['found'])
            : 0;

        if ($found == 1) {
            $fromMU = 1;
            $title = !empty($retArr['Person']['Title']) ? $retArr['Person']['Title'] : "";
            $isProfessor = findIfProfessor($title) == 1 ? 1 : 0;
        }
    }

    // Escape backslashes and double quotes so that a name containing either
    // cannot terminate the Cypher string literal early.
    $cypherName = strtr($unifiedName, array("\\" => "\\\\", "\"" => "\\\""));

    $q_str = "match (u:Person {name: \"" . $cypherName . "\"}) set u.fromMU = " . $fromMU
        . ", u.isProfessor = " . $isProfessor;
    $query = new Query($client, $q_str);
    $client->executeCypherQuery($query);
}
/**
 * Searches IEEE Xplore by title and re-ranks the hits through a Sphinx RT index.
 *
 * The IEEE gateway is queried for up to $numOfResults documents. Every
 * document that has a non-empty string `authors` field is inserted into the
 * `ieee` real-time index (truncated first), and the index is then queried with
 * MATCH($query_str) using the MATCHANY ranker.
 *
 * Terminates the script via exit() when the truncate, insert or ranking
 * statement fails.
 *
 * @param string $query_str    Search text; used for the remote search and for MATCH().
 * @param mixed  $sphinxPort   Passed through to sphinxDB_connect().
 * @param int    $numOfResults Maximum number of documents requested from IEEE.
 * @return array|null Ranked rows (possibly empty), or null when the IEEE
 *                    response contains no documents. Each row carries both the
 *                    historical misspelt key "pubtyle" and the correct "pubtype".
 */
function IEEEHandler($query_str, $sphinxPort, $numOfResults = 10)
{
    $table_name = "ieee";
    $mysqli = sphinxDB_connect($sphinxPort);

    $IEEE_url_prefix = "http://ieeexplore.ieee.org/gateway/ipsSearch.jsp?hc=" . $numOfResults . "&ti=";
    $url_parser = new URLParser($IEEE_url_prefix . urlencode($query_str));
    $IEEE_item_array = $url_parser->XMLToArray();

    if (!is_array($IEEE_item_array) || empty($IEEE_item_array['document'])) {
        return null;
    }

    $documents = oneRecordArrayFormation($IEEE_item_array['document']);

    // Column order must match the INSERT column list below.
    $columns = array(
        'title', 'authors', 'pubtitle', 'pubtype', 'volume', 'issue',
        'abstract', 'affiliation', 'issn', 'mdurl', 'pdf',
    );

    $rows = array();
    $id = 1;
    foreach ($documents as $attr_array) {
        if (!is_array($attr_array)) {
            continue;
        }

        // Documents without a usable author string are not indexed.
        $authors = isset($attr_array['authors']) ? $attr_array['authors'] : "";
        if (!is_string($authors) || empty($authors)) {
            continue;
        }

        $values = array();
        foreach ($columns as $column) {
            $value = isset($attr_array[$column]) ? $attr_array[$column] : "";
            // Non-scalar values (e.g. an array for an empty XML element)
            // are stored as an empty string rather than passed to the
            // string functions below.
            $value = is_scalar($value) ? (string) $value : "";

            // Abstracts containing markup are dropped entirely.
            if ($column === 'abstract' && $value !== strip_tags($value)) {
                $value = "";
            }

            $values[] = "'" . $mysqli->real_escape_string($value) . "'";
        }

        $rows[] = "({$id}, " . implode(", ", $values) . ")";
        ++$id;
    }

    // Nothing indexable: avoid sending an INSERT with an empty VALUES list.
    if (empty($rows)) {
        return array();
    }

    $ins_qry = "INSERT INTO {$table_name} (id,title, authors, pubtitle, pubtype, volume, issue, abstract, affiliation, issn, mdurl, pdf) VALUES"
        . implode(",", $rows);

    if (!$mysqli->query("TRUNCATE RTINDEX " . $table_name)) {
        exit("Error truncate: " . $mysqli->error);
    }
    if (!$mysqli->query($ins_qry)) {
        exit("Error insert: " . $mysqli->error);
    }

    // Rerank the result. The search text is escaped so quotes in it cannot
    // break out of the MATCH('...') string literal.
    $match = $mysqli->real_escape_string($query_str);
    $rank_query = "SELECT *,weight() AS weight FROM {$table_name} where MATCH('{$match}') LIMIT 0,1000 OPTION ranker=MATCHANY;";
    if (!($ranked_result = $mysqli->query($rank_query))) {
        exit("Error rank: " . $mysqli->error);
    }

    // Fetch the re-ranked result.
    $IEEEResultArray = array();
    while ($row = $ranked_result->fetch_assoc()) {
        $IEEEResultArray[] = array(
            "title" => $row['title'],
            "authors" => $row['authors'],
            "pubtitle" => $row['pubtitle'],
            // "pubtyle" is a long-standing typo kept for existing callers;
            // "pubtype" is the correctly spelt alias.
            "pubtyle" => $row['pubtype'],
            "pubtype" => $row['pubtype'],
            "volume" => $row['volume'],
            "issue" => $row['issue'],
            "abstract" => $row['abstract'],
            "issn" => $row['issn'],
            "mdurl" => $row['mdurl'],
            "pdf" => $row['pdf'],
        );
    }

    return $IEEEResultArray;
}
echo "is a prof\n"; } else { echo "NOT a prof\n"; } } else { $isProfessor = 0; //@Debu echo "NOT a prof\n"; } } else { die("fetch result from MOSpacePeople failed"); } } else { $peopleFinderURL = "https://webservices.doit.missouri.edu/peoplefinderWS/peoplefinderws.asmx/PeopleFinderXml?firstName=" . urlencode($first) . "&lastname=" . urlencode($last) . "&department=&phoneno=&email="; $url_parser = new URLParser($peopleFinderURL); $retArr = $url_parser->XMLToArray(); if (intval($retArr['@attributes']['found']) == 1) { //@Debug echo "found in Peoplefinder..."; $fromMU = 1; $title = array_key_exists("Title", $retArr['Person']) && !empty($retArr['Person']['Title']) ? $retArr['Person']['Title'] : ""; $isProfessor = findIfProfessor($title) == 1 ? 1 : 0; //@Debug if ($isProfessor == 1) { echo "is a prof\n"; } else { echo "NOT a prof\n"; } } else { $isProfessor = 0; //@Debug
/**
 * Searches PubMed and re-ranks the hits through a Sphinx RT index.
 *
 * Runs an esearch for $query_str, fetches the matching records with efetch,
 * flattens each record (title, authors, abstract, keywords, date,
 * affiliations) into the `pubmed` real-time index (truncated first), and
 * queries that index with MATCH($query_str) using the MATCHANY ranker.
 *
 * Terminates the script via die()/exit() when the efetch download or any of
 * the index statements fails; raises E_USER_ERROR on a malformed efetch
 * response.
 *
 * @param string $query_str    Search text; used for esearch and for MATCH().
 * @param mixed  $sphinxPort   Passed through to sphinxDB_connect().
 * @param int    $numOfResults Value sent as esearch RetMax.
 * @return array|false Ranked rows (possibly empty), or FALSE when the search
 *                     yields no ids or the fetched data is malformed.
 */
function pubmedHandler($query_str, $sphinxPort, $numOfResults = 10)
{
    $table_name = "pubmed";
    $mysqli = sphinxDB_connect($sphinxPort);

    $pubmedSearchURLPrefix = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?RetMax=" . $numOfResults . "&db=pubmed&term=";
    $url_parser = new URLParser($pubmedSearchURLPrefix . urlencode($query_str));
    $pubmedSearchResultArray = $url_parser->XMLToArray();

    if (!is_array($pubmedSearchResultArray)
        || !array_key_exists('Count', $pubmedSearchResultArray)
        || !($pubmedSearchResultArray['Count'] > 0)
        || empty($pubmedSearchResultArray['IdList']['Id'])
    ) {
        return FALSE;
    }

    // A single hit arrives as a bare string, several hits as a list.
    $pubmedIds = (array) $pubmedSearchResultArray['IdList']['Id'];

    $pubmedFetchURL = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id="
        . implode(",", $pubmedIds);
    $pubmedFetchXML = file_get_contents($pubmedFetchURL);
    if ($pubmedFetchXML == FALSE) {
        die("file_get_contents pubmed fetch failed");
    }

    $pubmedFetchResult = simplexml_load_string($pubmedFetchXML);
    $pubmedFetchResultArray = json_decode(json_encode($pubmedFetchResult), TRUE);
    // An unparseable document decodes to a non-array; treat it like a
    // response without PubmedArticle.
    if (!is_array($pubmedFetchResultArray) || !array_key_exists('PubmedArticle', $pubmedFetchResultArray)) {
        trigger_error("PubmedArticle not set in pubmedFetchResultArray", E_USER_ERROR);
        return FALSE;
    }

    // A single article is an associative array; wrap it so both shapes iterate alike.
    $articles = isset($pubmedFetchResultArray['PubmedArticle'][0])
        ? $pubmedFetchResultArray['PubmedArticle']
        : array($pubmedFetchResultArray['PubmedArticle']);

    $rows = array();
    $id = 1;
    foreach ($articles as $record) {
        if (!is_array($record) || !array_key_exists('MedlineCitation', $record)) {
            trigger_error("MedlineCitation not set in one pubmed record", E_USER_ERROR);
            return FALSE;
        }

        $citation = $record['MedlineCitation'];
        $article = isset($citation['Article']) && is_array($citation['Article'])
            ? $citation['Article']
            : array();

        // The PMID is interpolated unquoted into the INSERT, so force it to an integer.
        $pubid = isset($citation['PMID']) && is_scalar($citation['PMID']) ? (int) $citation['PMID'] : 0;

        // Titles with inline markup decode to arrays; store those as empty text.
        $title = isset($article['ArticleTitle']) && is_scalar($article['ArticleTitle'])
            ? (string) $article['ArticleTitle']
            : "";

        $abstract = "";
        if (isset($article['Abstract']) && is_array($article['Abstract'])
            && array_key_exists("AbstractText", $article['Abstract'])
        ) {
            $abstractParts = $article['Abstract']['AbstractText'];
            if (!is_array($abstractParts)) {
                $abstractParts = array($abstractParts);
            }
            foreach ($abstractParts as $ab) {
                if (is_string($ab)) {
                    $abstract .= $ab . " ";
                }
            }
        }

        $authors = "";
        $affiliations = "";
        if (!empty($article['AuthorList']['Author']) && is_array($article['AuthorList']['Author'])) {
            $authorEntries = isset($article['AuthorList']['Author'][0])
                ? $article['AuthorList']['Author']
                : array($article['AuthorList']['Author']);

            foreach ($authorEntries as $a) {
                if (!is_array($a) || !array_key_exists('LastName', $a) || !array_key_exists('ForeName', $a)) {
                    continue;
                }
                $authors .= $a['LastName'] . ", " . $a['ForeName'] . " | ";

                // One " | "-separated affiliation slot per listed author keeps
                // the two strings aligned; "NULL" marks a missing affiliation.
                if (!empty($a['AffiliationInfo']) && is_array($a['AffiliationInfo'])) {
                    $infos = isset($a['AffiliationInfo'][0]) ? $a['AffiliationInfo'] : array($a['AffiliationInfo']);
                    foreach ($infos as $af) {
                        if (is_array($af) && isset($af['Affiliation']) && is_string($af['Affiliation'])) {
                            $affiliations .= $af['Affiliation'] . ";";
                        }
                    }
                    $affiliations = rtrim($affiliations, ";");
                    $affiliations .= " | ";
                } else {
                    $affiliations .= "NULL" . " | ";
                }
            }
            $authors = rtrim($authors, "|");
            $affiliations = rtrim($affiliations, "|");
        }

        $keywords = "";
        if (!empty($citation['KeywordList']['Keyword'])) {
            // A lone keyword is a plain string. Testing isset($kw[0]) would
            // succeed on its first character, so check is_array() instead.
            $keywordEntries = $citation['KeywordList']['Keyword'];
            if (!is_array($keywordEntries)) {
                $keywordEntries = array($keywordEntries);
            }
            foreach ($keywordEntries as $k) {
                if (is_string($k)) {
                    $keywords .= $k . " | ";
                }
            }
            $keywords = rtrim($keywords, "|");
        }

        $date = "";
        if (isset($article['ArticleDate']['Year'], $article['ArticleDate']['Month'], $article['ArticleDate']['Day'])) {
            $date = $article['ArticleDate']['Year'] . "-" . $article['ArticleDate']['Month'] . "-" . $article['ArticleDate']['Day'];
        }

        $url = 'http://www.ncbi.nlm.nih.gov/pubmed/' . $pubid;

        $title = $mysqli->real_escape_string($title);
        $abstract = $mysqli->real_escape_string($abstract);
        $authors = $mysqli->real_escape_string($authors);
        $affiliations = $mysqli->real_escape_string($affiliations);
        $keywords = $mysqli->real_escape_string($keywords);
        // The date is assembled from remote data as well, so escape it too.
        $date = $mysqli->real_escape_string($date);

        $rows[] = "({$id}, {$pubid}, '{$title}', '{$authors}', '{$abstract}', '{$url}', '{$date}', '{$keywords}', '{$affiliations}')";
        ++$id;
    }

    $ins_qry = "INSERT INTO {$table_name} (id,pubid,title,authors,abstract,url,date,keywords,affiliations) VALUES"
        . implode(",", $rows);

    if (!$mysqli->query("TRUNCATE RTINDEX " . $table_name)) {
        exit("Error truncate: " . $mysqli->error);
    }
    if (!$mysqli->query($ins_qry)) {
        exit("Error insert: " . $mysqli->error);
    }

    // Rerank the result. The search text is escaped so quotes in it cannot
    // break out of the MATCH('...') string literal.
    $match = $mysqli->real_escape_string($query_str);
    $rank_query = "SELECT *,weight() AS weight FROM {$table_name} where MATCH('{$match}') LIMIT 0,1000 OPTION ranker=MATCHANY;";
    if (!($ranked_result = $mysqli->query($rank_query))) {
        exit("Error rank: " . $mysqli->error);
    }

    // Fetch the re-ranked result.
    $pubmedResultArray = array();
    while ($row = $ranked_result->fetch_assoc()) {
        $pubmedResultArray[] = array(
            "pubid" => $row['pubid'],
            "title" => $row['title'],
            "abstract" => $row['abstract'],
            "authors" => $row['authors'],
            "keywords" => $row['keywords'],
            "date" => $row['date'],
            "url" => $row['url'],
            "affiliations" => $row['affiliations'],
        );
    }

    return $pubmedResultArray;
}
/**
 * Collects co-authored PubMed publications for this person.
 *
 * Searches PubMed for "<firstName> <lastName>[Author] AND
 * <affiliation>[Affiliation]", fetches the matching records, and returns one
 * entry per record that has between 2 and 20 listed authors.
 *
 * @return array|null List of array("title" => ..., "url" => ..., "people" => list of
 *                    array("firstName", "lastName", "affiliation")); an empty
 *                    array when the search has no hits; null when the fetch
 *                    response lacks PubmedArticle (after raising E_USER_ERROR).
 */
public function crawlPubmed()
{
    $search_url = $this->pubmed_search_url . "&term="
        . urlencode($this->firstName . " " . $this->lastName . "[Author]" . " AND " . $this->affiliation . "[Affiliation]");
    $search_url_parser = new URLParser($search_url);
    $pubmedSearch_item_array = $search_url_parser->XMLToArray();

    $returnVal = array();

    if (!is_array($pubmedSearch_item_array)
        || !array_key_exists('Count', $pubmedSearch_item_array)
        || !($pubmedSearch_item_array['Count'] > 0)
        || empty($pubmedSearch_item_array['IdList']['Id'])
    ) {
        return $returnVal;
    }

    // A single hit arrives as a bare string, several hits as a list.
    $pubmedIds = (array) $pubmedSearch_item_array['IdList']['Id'];

    $fetch_url = $this->pubmed_fetch_url . implode(",", $pubmedIds);
    $fetch_url_parser = new URLParser($fetch_url);
    $pubmedFetchResultArray = $fetch_url_parser->XMLToArray();

    if (!is_array($pubmedFetchResultArray) || !array_key_exists('PubmedArticle', $pubmedFetchResultArray)) {
        trigger_error("PubmedArticle not set in pubmedFetchResultArray", E_USER_ERROR);
        return null;
    }

    // A single article is an associative array; wrap it so both shapes iterate alike.
    $articles = isset($pubmedFetchResultArray['PubmedArticle'][0])
        ? $pubmedFetchResultArray['PubmedArticle']
        : array($pubmedFetchResultArray['PubmedArticle']);

    foreach ($articles as $record) {
        if (!is_array($record) || !array_key_exists('MedlineCitation', $record)) {
            trigger_error("MedlineCitation not set in one pubmed record", E_USER_ERROR);
            continue;
        }

        $article = $record['MedlineCitation']['Article'];
        $title = $article['ArticleTitle'];
        $url = 'http://www.ncbi.nlm.nih.gov/pubmed/' . $record['MedlineCitation']['PMID'];

        if (array_key_exists('AuthorList', $article)
            && array_key_exists('Author', $article['AuthorList'])
            && !empty($article['AuthorList'])
        ) {
            $authorList = $article['AuthorList']['Author'];
        } else {
            trigger_error("authorlist/author not set or author is empty", E_USER_ERROR);
            continue;
        }

        // A lone author is an associative array (no index 0), so this also
        // skips single-author papers.
        if (!isset($authorList[0])) {
            continue;
        }
        if (count($authorList) > 20) {
            continue;
        }

        $coauthorList = array();
        foreach ($authorList as $a) {
            // Entries without a LastName (e.g. collective/group authors)
            // cannot be turned into a person name.
            if (!is_array($a) || !isset($a['LastName'])) {
                continue;
            }
            $foreName = isset($a['ForeName']) ? $a['ForeName'] : "";

            // Converting pubmed name to neo4j compatible format.
            $parser = new HumanNameParser_Parser($a['LastName'] . ", " . $foreName);
            $first = $parser->getFirst();
            $last = $parser->getLast();

            // Reset for every author so that an unrecognised AffiliationInfo
            // shape cannot inherit the previous author's affiliation.
            $affi = "";
            if (isset($a['AffiliationInfo']) && is_array($a['AffiliationInfo'])) {
                $info = $a['AffiliationInfo'];
                if (array_key_exists('Affiliation', $info)) {
                    $affi = $info['Affiliation'];
                } elseif (isset($info[0]) && is_array($info[0]) && array_key_exists('Affiliation', $info[0])) {
                    // Several affiliations: only the first one is used.
                    $affi = $info[0]['Affiliation'];
                }
            }

            $coauthorList[] = array("firstName" => $first, "lastName" => $last, "affiliation" => $affi);
        }

        $returnVal[] = array("title" => $title, "url" => $url, "people" => $coauthorList);
    }

    return $returnVal;
}