/**
 * Scores each possible donation found in $text against $this->person and
 * forwards the ones judged to belong to the same individual to
 * parseRecipients().
 *
 * For every contributor returned by getContributors() a donor is built with
 * generateDonor(); its name, first address and summary are compared with the
 * person's, the comparison is echoed, and a fixed list of rules decides
 * whether the match is confident. When $this->prompt == 1 the operator's
 * y/n answer replaces the automatic decision.
 *
 * Review fixes relative to the previous version:
 *  - a donor postal code already stored in $this->temp_postal now really sets
 *    the zip match (the old code used `==`, a comparison with no effect);
 *  - alias matches are joined into the bio text instead of being concatenated
 *    as an array, which appended the literal word "Array";
 *  - the "both middle names are full names" test now looks at the donor's
 *    middle name too (the person's was tested twice);
 *  - rules 6, 8 and 9 were dropped: 6 was a stricter copy of 5 and 8/9 were
 *    exact copies of 7, so under elseif ordering none could ever fire.
 *
 * @param mixed $text raw input handed to getContributors()
 * @return void
 */
function parseDonorData($text)
{
    $this->_entity_reference = false;
    $contributors = $this->getContributors($text);
    $this->printDebug("Found " . count($contributors) . " possible donations");

    foreach ($contributors as $contributor) {
        $contribution = $contributor[0];
        $donor = $this->generateDonor($contribution);
        $person = $this->person;

        // ---- name comparison -------------------------------------------
        $first_name_match = ($person->name_first == $donor->name_first) ? self::YES : self::NO;
        $middle_name_match = ($person->name_middle == $donor->name_middle) ? self::YES : self::NO;
        $last_name_match = ($person->name_last == $donor->name_last) ? self::YES : self::NO;
        $common_name = self::NO;
        $city_match = self::NO;
        $state_match = self::NO;
        $zip_match = self::NO;
        $common_city = self::NO;

        // Both middle names are present but differ: decide whether they are
        // still worth flagging as AMBIGUOUS.
        if (strlen($person->name_middle) && strlen($donor->name_middle)
            && $person->name_middle != $donor->name_middle
        ) {
            // NOTE(review): two full middle names that do NOT contain each
            // other are marked AMBIGUOUS here, exactly as before. Whether that
            // is intended depends on how NO/YES/AMBIGUOUS are ordered; confirm.
            $both_full = strlen($person->name_middle) > 1 && strlen($donor->name_middle) > 1;
            if ($both_full
                && !stristr($person->name_middle, $donor->name_middle)
                && !stristr($donor->name_middle, $person->name_middle)
            ) {
                $middle_name_match = self::AMBIGUOUS;
            }

            // One side is only an initial and the initials agree.
            $either_initial = strlen($person->name_middle) == 1 || strlen($donor->name_middle) == 1;
            if ($either_initial
                && substr($person->name_middle, 0, 1) == substr($donor->name_middle, 0, 1)
            ) {
                $middle_name_match = self::AMBIGUOUS;
            }
        }

        // Donor name parts shorter than two characters can never be more
        // than AMBIGUOUS, whatever the comparison above said.
        if (strlen($donor->name_first) < 2) {
            $first_name_match = self::AMBIGUOUS;
        }
        if (strlen($donor->name_middle) > 0 && strlen($donor->name_middle) < 2) {
            $middle_name_match = self::AMBIGUOUS;
        }
        if (strlen($donor->name_last) < 2) {
            $last_name_match = self::AMBIGUOUS;
        }

        if (in_array($person->name_last, LsLanguage::$commonLastNames)
            && in_array($person->name_first, LsLanguage::$commonFirstNames)
        ) {
            $common_name = self::YES;
        }

        // ---- debug output ----------------------------------------------
        // The first literal contained a raw line break in the source; "\n"
        // yields the same bytes.
        $this->printDebug(" \nDonor name: " . $donor->name_first . " " . $donor->name_middle . " " . $donor->name_last);
        $this->printDebug(" Donor address: " . $donor->Address[0]->State->name . ", " . LsLanguage::titleize($donor->Address[0]->city) . ", " . $donor->Address[0]->postal);
        $this->printDebug(" Donor organization: " . LsLanguage::titleize(trim($donor->summary)));
        $this->printDebug(" Person name: " . $person->name_first . " " . $person->name_middle . " " . $person->name_last);

        // ---- address comparison (first person address only) -------------
        foreach ($person->Address as $address) {
            $this->printDebug(" Person Address: " . $address->State->name . ", " . $address->city . ", " . $address->postal . " ");

            if ($address->State->name == $donor->Address[0]->State->name) {
                $state_match = self::YES;
            }
            if (LsLanguage::titleize($address->city) == LsLanguage::titleize($donor->Address[0]->city)) {
                $city_match = self::YES;
            }
            // Same three-character prefix is only a hint; an exact postal
            // code upgrades it to YES.
            if (substr($address->postal, 0, 3) == substr($donor->Address[0]->postal, 0, 3)) {
                $zip_match = self::AMBIGUOUS;
            }
            if ($address->postal == $donor->Address[0]->postal) {
                $zip_match = self::YES;
            }
            // A postal code of a donor that was already accepted for this
            // person counts as a match. (Was `==`, which did nothing.)
            if (in_array($donor->Address[0]->postal, $this->temp_postal)) {
                $zip_match = self::YES;
            }
            if (in_array($donor->Address[0]->city, LsLanguage::$commonCities)) {
                $common_city = self::YES;
            }
            break; //currently support only one address;
        }

        // ---- organization comparison -----------------------------------
        $orgs = $this->person->getRelatedEntitiesQuery('Org', RelationshipTable::POSITION_CATEGORY, null, null, null, false, 1)->execute();

        // Third argument of getCommonPronouns(); presumably the words it
        // should ignore -- confirm against LsLanguage. Built once per donor
        // instead of once per alias.
        $ignored_words = array_merge(
            LsLanguage::$business,
            LsLanguage::$schools,
            LsLanguage::$grammar,
            LsLanguage::$states,
            LsLanguage::$geography,
            array($person->name_last, $person->name_first, $person->name_middle, $person->name_nick, 'Retired', 'Requested', 'Info', 'Employed')
        );

        $bio = $person->summary;
        foreach ($person->Alias as $alias) {
            $this->printDebug(" Aliases: " . $alias->name . "...");
            $alias_words = LsLanguage::getCommonPronouns($person->name, $alias->name, $ignored_words);
            // The result is counted and iterated further down, i.e. it is a
            // list; join it rather than concatenating the array itself.
            $bio .= ' ' . (is_array($alias_words) ? implode(' ', $alias_words) : $alias_words);
        }
        foreach ($orgs as $org) {
            $this->printDebug(" Person organizations: " . $org->name . "...");
            $bio .= ' ' . $org->name;
        }

        $summary_matches = LsLanguage::getCommonPronouns(LsLanguage::titleize(trim($donor->summary)), trim($bio), $ignored_words);
        $this->printDebug(" Person organizations: " . $bio);
        $organization_matches = count($summary_matches);

        // ---- report ----------------------------------------------------
        echo ' ';
        echo ' Matching First: ' . self::$labels[$first_name_match] . ", ";
        echo ' Last: ' . self::$labels[$last_name_match] . ", ";
        echo ' Middle: ' . self::$labels[$middle_name_match] . ", ";
        echo ' City: ' . self::$labels[$city_match] . ", ";
        echo ' State: ' . self::$labels[$state_match] . ", ";
        echo ' Zip: ' . self::$labels[$zip_match] . ", ";
        echo 'Organization count: ' . $organization_matches;
        if ($organization_matches) {
            echo " (" . implode(', ', $summary_matches) . ")";
        }
        echo "\n";

        // ---- decision rules --------------------------------------------
        // Shared sub-conditions of the rules below.
        $exact_name = $first_name_match == self::YES
            && $middle_name_match > self::NO
            && $last_name_match == self::YES;
        $same_area = $state_match > self::NO
            && $city_match > self::NO
            && $zip_match > self::NO;

        $confident = true;
        if ($exact_name && $state_match == self::YES && $city_match == self::YES && $zip_match == self::YES) {
            /* direct hit */
            $this->printDebug(" CONFIDENT 1");
        } elseif ($first_name_match > self::NO && $middle_name_match > self::NO && $last_name_match == self::YES && $organization_matches > 1 && !$common_name) {
            $this->printDebug(" CONFIDENT 2 (not common name)");
        } elseif ($exact_name && $state_match > self::NO && $organization_matches && !$common_name) {
            $this->printDebug(" CONFIDENT 3 (not common name)");
        } elseif ($first_name_match == self::AMBIGUOUS && $middle_name_match == self::YES && $last_name_match == self::YES && $state_match > self::NO && $organization_matches && !$common_name) {
            $this->printDebug(" CONFIDENT 4 (not common name)");
        } elseif ($exact_name && $same_area && !$common_city) {
            $this->printDebug(" CONFIDENT 5");
        } elseif ($exact_name && $same_area) {
            // Rule numbers are kept stable; 6, 8 and 9 were unreachable.
            $this->printDebug(" CONFIDENT 7");
        } elseif ($exact_name && $zip_match > self::YES && $organization_matches) {
            // NOTE(review): `> self::YES` only makes sense if AMBIGUOUS is
            // numerically greater than YES; otherwise this rule never fires
            // and `== self::YES` was probably meant. Constants not visible
            // here -- confirm before changing.
            $this->printDebug(" CONFIDENT 10");
        } else {
            $confident = false;
        }

        // ---- optional manual override ----------------------------------
        if ($this->prompt == 1) {
            $accept = $this->readline(' Is this the same entity? (y or n)');
            $attempts = 1;
            while ($accept != 'y' && $accept != 'n' && $attempts < 5) {
                $accept = $this->readline(' Is this the same entity? (y or n) ');
                $attempts++;
            }
            // Anything but an explicit "y" (including five bad answers)
            // rejects the donation.
            $confident = ($accept == 'y');
        }

        if ($confident) {
            $this->parseRecipients($contribution);
            // Remember the postal code so later donations from the same
            // address are recognised by the temp_postal check above.
            $this->temp_postal[] = $donor->Address[0]->postal;
        } else {
            $this->printDebug(" NO CONFIDENCE. SKIPPING...\n");
        }
    }
}
/**
 * Runs LsLanguage::getCommonPronouns() on this entity's extended bio and the
 * supplied text.
 *
 * The extended bio (from getExtendedBio()) is trimmed and titleized, $str is
 * trimmed, and the third argument is the LsLanguage business, school,
 * grammar, state and geography lists merged with this entity's own name
 * parts and a few filler words.
 *
 * @param string $str text to compare with the extended bio
 * @return mixed whatever LsLanguage::getCommonPronouns() returns
 */
public function getCommonBioPronouns($str)
{
    $extendedBio = $this->getExtendedBio();

    $bioText = LsLanguage::titleize(trim($extendedBio));
    $otherText = trim($str);

    $thirdArgument = array_merge(
        LsLanguage::$business,
        LsLanguage::$schools,
        LsLanguage::$grammar,
        LsLanguage::$states,
        LsLanguage::$geography,
        array(
            $this->name_last,
            $this->name_first,
            $this->name_middle,
            $this->name_nick,
            'Retired',
            'Requested',
            'Info',
            'Employed',
        )
    );

    return LsLanguage::getCommonPronouns($bioText, $otherText, $thirdArgument);
}
/**
 * Imports education history for $person from the candidate profiles in
 * $possible_persons.
 *
 * A candidate is skipped when it has no education or no employment entries.
 * Otherwise its summary plus employer names are compared, through
 * LsLanguage::getCommonPronouns(), with $person's summary plus the names of
 * its related organizations. If nothing matches, the method returns false
 * immediately. If something matches, every education entry of the candidate
 * is turned into a school entity (looked up by alias, then by name, else
 * created), a degree, and an Education relationship with a reference.
 *
 * Review fixes relative to the previous version:
 *  - the debug line counted the single candidate object
 *    (count($possible_person)), which is a TypeError on PHP 8 for
 *    non-countable objects; it now counts the candidate list;
 *  - the institution clean-up used an empty regex ('//isu'), which matches at
 *    every position and puts a space between all characters of the name.
 *
 * NOTE(review): a candidate without organization matches aborts the whole
 * loop (return false) while candidates lacking education/employment are only
 * skipped. Kept as is because callers may rely on the return value.
 *
 * @param Entity $person            entity that receives the relationships
 * @param mixed  $possible_persons  iterable, countable list of candidate
 *                                  profiles (assumed to be an array -- confirm)
 * @return bool false when a candidate shares no organization with $person,
 *              true otherwise
 */
function import(Entity $person, $possible_persons)
{
    //loop through the people we found. usually just one.
    foreach ($possible_persons as $possible_person) {
        $this->printDebug('Query returned ' . count($possible_persons) . ' person named ' . $possible_person->name);

        //this person does not provide education. we skip
        if (!count($possible_person->education)) {
            $this->printDebug('No education history found');
            continue;
        }
        $this->printDebug('Education found');

        //get employment info for this possible match
        $possible_person_bio = $possible_person->summary;
        if (!count($possible_person->employment_history)) {
            $this->printDebug('No employment history found');
            continue;
        }
        foreach ($possible_person->employment_history as $employment) {
            $possible_person_bio .= ' ' . $employment->company . " ";
        }
        $this->printDebug('Employment found');

        //get employment info for the person in our database
        $relationship_orgs = $person->getRelatedEntitiesQuery('Org', RelationshipTable::POSITION_CATEGORY, null, null, null, false, 1)->execute();
        $person_bio = $person->summary;
        foreach ($relationship_orgs as $org) {
            $person_bio .= ' ' . $org->name;
        }

        //lets see how many matches we get
        $matches = LsLanguage::getCommonPronouns($person_bio, trim($possible_person_bio), LsLanguage::$business);
        if (!count($matches)) {
            $this->printDebug('No organization matches');
            return false;
        }

        foreach ($possible_person->education as $school) {
            $school->institution = mb_convert_encoding($school->institution, 'UTF-8');
            // NOTE(review): the original pattern was empty, most likely
            // because a non-printable character was lost from the source.
            // Non-breaking space is assumed here -- confirm against the
            // scraped data.
            $school->institution = preg_replace('/\x{00A0}/u', ' ', $school->institution);
            $this->printDebug('Looking for the school: ' . $school->institution);

            //find school: by alias first, then by name, else create it
            $current_school = EntityTable::findByAlias($school->institution, 'bw_school');
            if ($current_school) {
                $this->printDebug('Found school');
            } else {
                $current_school = EntityTable::getByExtensionQuery(array('Org', 'School'))
                    ->addWhere('LOWER(org.name) LIKE ?', '%' . strtolower($school->institution) . "%")
                    ->fetchOne();

                if (!$current_school) {
                    $current_school = $this->createSchoolFromWikipedia($school->institution);
                    $this->printDebug('Adding new school');
                }

                // Store the spelling under the alias context used by the
                // findByAlias() lookup above.
                $alias = new Alias();
                $alias->name = $school->institution;
                $alias->context = 'bw_school';
                $alias->Entity = $current_school;
                $alias->save();
            }

            //find degree
            $degree = DegreeTable::getByText($school->degree);
            if (!$degree) {
                $degree = DegreeTable::addDegree($school->degree);
                $this->printDebug('Adding new degree');
            }

            //find relationship: reuse one that already carries this degree
            $relationship = null;
            $relationships = $person->getRelationshipsWithQuery($current_school, RelationshipTable::EDUCATION_CATEGORY)->execute();
            foreach ($relationships as $existing_relationship) {
                if ($existing_relationship->degree_id == $degree->id) {
                    $relationship = $existing_relationship;
                    break;
                }
            }

            if ($relationship) {
                $this->printDebug('Relationship between person and school exists');
            } else {
                $relationship = new Relationship();
                $relationship->Entity1 = $person;
                $relationship->Entity2 = $current_school;
                $relationship->description1 = 'student';
                $relationship->is_current = 0;
                if ($school->year) {
                    $relationship->end_date = $school->year;
                }
                $relationship->setCategory('Education');
                $this->printDebug('Creating new relationship between person and school');
            }

            //save
            $relationship->save();

            //add degree and reference (only when no degree is attached yet)
            if ($relationship->degree_id == null) {
                $reference_name = strstr($school->source, 'wikipedia') ? "Wikipedia" : "BusinessWeek";
                $relationship->Degree = $degree;
                $relationship->save();
                // arguments: source, excerpt, fields, name, detail, date
                $relationship->addReference($school->source, null, array('degree_id'), $reference_name, null, null);
                $this->printDebug('Adding degree and reference');
            }
        }
    }

    return true;
}

/**
 * Creates and saves a new Org/School entity named $name, filling in student
 * and faculty counts, public/private flag, endowment, founding date and
 * summary from the Wikipedia info box when a non-disambiguation page is
 * found. Without such a page the bare entity is saved.
 *
 * @param string $name institution name, also used as the Wikipedia request
 * @return Entity the saved school
 */
private function createSchoolFromWikipedia($name)
{
    $new_school = new Entity();
    $new_school->addExtension('Org');
    $new_school->addExtension('School');
    $new_school->name = $name;

    $wikipedia = new LsWikipedia();
    $wikipedia->request($name);
    if (!$wikipedia->execute() || $wikipedia->isDisambiguation()) {
        $new_school->save();
        return $new_school;
    }

    $info_box = $wikipedia->getInfoBox();
    // First run of at least two digits/commas in an info box value.
    $count_pattern = '/([\\d\\,]{2,})/isu';

    if (isset($info_box['students']) && preg_match($count_pattern, $info_box['students']['clean'], $match)) {
        $new_school->students = LsNumber::clean($match[1]);
    } else {
        // No total given: add up whichever per-level counts are present.
        $num_students = 0;
        foreach (array('undergrad', 'postgrad', 'grad', 'doctoral') as $st) {
            if (isset($info_box[$st]) && preg_match($count_pattern, $info_box[$st]['clean'], $match)) {
                $num_students += LsNumber::clean($match[1]);
            }
        }
        if ($num_students > 0) {
            $new_school->students = $num_students;
        }
    }

    if (isset($info_box['faculty']) && preg_match($count_pattern, $info_box['faculty']['clean'], $match)) {
        $new_school->faculty = LsNumber::clean($match[1]);
    }

    if (isset($info_box['type'])) {
        // "public" is tested first, so a type mentioning both counts as public.
        if (stristr($info_box['type']['clean'], 'public')) {
            $new_school->is_private = 0;
        } elseif (stristr($info_box['type']['clean'], 'private')) {
            $new_school->is_private = 1;
        }
    }

    if (isset($info_box['endowment'])
        && preg_match('/(\\$[\\d\\,\\.\\s]+)(million|billion)/isu', $info_box['endowment']['clean'], $match)
    ) {
        $factor = (strtolower($match[2]) == 'billion') ? 1000000000 : 1000000;
        $new_school->endowment = LsNumber::formatDollarAmountAsNumber($match[1], $factor);
    }

    if (isset($info_box['established'])) {
        // Prefer a full date; fall back to the first four-digit number.
        if ($date = LsDate::convertDate($info_box['established']['clean'])) {
            $new_school->start_date = $date;
        } elseif (preg_match('/\\b(\\d\\d\\d\\d)\\b/isu', $info_box['established']['clean'], $match)) {
            $new_school->start_date = $match[1];
        }
    }

    $summary = trim($wikipedia->getIntroduction());
    $summary = preg_replace('/\\n\\s*\\n/isu', '', $summary);
    if (strlen($summary) > 10) {
        $new_school->summary = $summary;
    }

    $new_school->save();
    // arguments: source, excerpt, fields, name
    $new_school->addReference($wikipedia->getUrl(), null, array('summary'), 'Wikipedia');

    return $new_school;
}
/**
 * Decides whether two person entities describe the same individual.
 *
 * Last, first and middle names, start dates and summaries of both entities
 * are reduced to sets of boolean conditions; the entities are considered the
 * same as soon as one of eleven fixed bundles of conditions holds. Every
 * bundle requires matching, non-initial last names and compatible dates.
 *
 * Review fix relative to the previous version: the `subset` condition used
 * stristr() with possibly empty names. That raises "Empty needle" warnings
 * on PHP < 8 and returns the (truthy) haystack on PHP >= 8, so the value
 * depended on the PHP version. It is now false whenever either name is
 * empty, which is what PHP < 8 produced, and it no longer relies on the
 * truthiness of stristr()'s string result.
 *
 * @param Entity $p1
 * @param Entity $p2
 * @param bool   $consistency when true, the two loosest bundles (first names
 *                            only have to contain one another) are disabled
 * @param bool   $reporting   when true, return the report array instead of
 *                            a plain boolean
 * @return bool|array the verdict, or array(verdict, $lasts, $firsts,
 *                    $middles, $dates, $bios) when $reporting is true
 */
static function areSame(Entity $p1, Entity $p2, $consistency = false, $reporting = false)
{
    //FIRST WE GET THE RELEVANT VARIABLES
    //first person
    $f1 = $p1->name_first;
    $m1 = $p1->name_middle;
    $l1 = $p1->name_last;
    $d1 = new LsDate($p1->start_date);
    $b1 = $p1->summary;

    //second person
    $f2 = $p2->name_first;
    $m2 = $p2->name_middle;
    $l2 = $p2->name_last;
    $d2 = new LsDate($p2->start_date);
    $b2 = $p2->summary;

    //DEFINE CONDITIONS
    //lasts
    $lasts = self::compareNameParts($l1, $l2);
    $lasts->uncommon = !in_array($l1, LsLanguage::$commonLastNames);

    //firsts
    $firsts = self::compareNameParts($f1, $f2);
    $firsts->uncommon = !in_array($f1, LsLanguage::$commonFirstNames);

    //middles
    $middles = self::compareNameParts($m1, $m2);

    //dates
    $dates = (object) '';
    $dates->nonempty = !$d1->isBlank() && !$d2->isBlank();
    $dates->match = $dates->nonempty && (string) $d1 == (string) $d2;
    $dates->compatible = self::birthDatesAreCompatible($d1, $d2);

    //bios
    $bios = (object) '';
    $bios->nonempty = $b1 && $b2;
    $bios->count = count(LsLanguage::getCommonPronouns($b1, $b2, array_merge(array($f1, $l1, $f2, $l2), LsLanguage::$business, LsLanguage::$months, LsLanguage::$prefixes, LsLanguage::$schools, LsLanguage::$grammar)));

    //REPORTING
    $report = array($lasts, $firsts, $middles, $dates, $bios);

    //CHECK BUNDLES OF CONDITIONS FOR MATCHING
    // Each entry is one bundle, in the original order. `&&` chains always
    // yield a real boolean, so a strict search for true is safe.
    $bundles = array(
        $lasts->match && $lasts->noninitial && $firsts->match && $firsts->noninitial && $middles->nonempty && $middles->match && $dates->compatible && $dates->nonempty,
        $lasts->match && $lasts->noninitial && $firsts->nonempty && $firsts->match && $middles->match && $middles->noninitial && $dates->compatible && $dates->nonempty,
        $lasts->match && $lasts->noninitial && $lasts->uncommon && $firsts->match && $firsts->noninitial && $firsts->uncommon && $middles->compatible && $dates->compatible && $dates->nonempty,
        $lasts->match && $lasts->noninitial && $lasts->uncommon && $firsts->nonempty && $firsts->match && $middles->nonempty && $middles->match && $dates->compatible && $dates->nonempty,
        $lasts->match && $lasts->noninitial && $lasts->uncommon && $firsts->nonempty && $firsts->subset && $middles->match && $middles->noninitial && $dates->compatible && $dates->nonempty,
        $lasts->match && $lasts->noninitial && $lasts->uncommon && $firsts->match && $firsts->noninitial && $middles->match && $middles->noninitial && $dates->compatible,
        $lasts->match && $lasts->noninitial && $firsts->match && $firsts->noninitial && $middles->compatible && $dates->compatible && $bios->count > 7,
        $lasts->match && $lasts->noninitial && $firsts->match && $firsts->nonempty && $middles->match && $middles->nonempty && $dates->compatible && $bios->count > 7,
        $lasts->match && $lasts->noninitial && $lasts->uncommon && $firsts->nonempty && $firsts->match && $middles->compatible && $dates->compatible && $dates->nonempty && $bios->count > 7,
        $lasts->match && $lasts->noninitial && $firsts->nonempty && $firsts->subset && $middles->compatible && $dates->compatible && $dates->nonempty && $bios->count > 7 && !$consistency,
        $lasts->match && $lasts->noninitial && $firsts->nonempty && $firsts->subset && $dates->compatible && $bios->count > 15 && !$consistency,
    );
    $same = in_array(true, $bundles, true);

    // The verdict always goes in front of the condition objects.
    array_unshift($report, $same);

    return $reporting ? $report : $same;
}

/**
 * Builds the condition object shared by the last, first and middle name
 * comparisons in areSame().
 *
 * @param mixed $a name part of the first person
 * @param mixed $b name part of the second person
 * @return object with boolean members nonempty, match, noninitial, subset
 *                and compatible
 */
private static function compareNameParts($a, $b)
{
    // (object) '' yields a stdClass that already carries an empty `scalar`
    // member; kept so the objects in the report keep their previous shape.
    $cmp = (object) '';
    $cmp->nonempty = $a && $b;
    $cmp->match = $a == $b;
    $cmp->noninitial = strlen($a) > 1 && strlen($b) > 1;

    // One name contains the other, ignoring case. Empty names never count,
    // which also keeps empty needles away from the string search.
    $a_text = (string) $a;
    $b_text = (string) $b;
    $cmp->subset = $a_text !== '' && $b_text !== ''
        && (stripos($a_text, $b_text) !== false || stripos($b_text, $a_text) !== false);

    $cmp->compatible = !$cmp->nonempty || $cmp->subset;

    return $cmp;
}