/
add_documents.php
executable file
·126 lines (101 loc) · 3.55 KB
/
add_documents.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
<html>
<head>
<meta charset="UTF-8">
<title>Admin</title>
<!-- Every CDN asset is requested over HTTPS so that none of them is blocked as
     mixed content when this page itself is served over HTTPS. -->
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.2.0/css/bootstrap.min.css">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.1/jquery.min.js"></script>
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.2.0/js/bootstrap.min.js"></script>
<style>
body{margin-top:50px;}
.glyphicon { margin-right:10px; }
.panel-body { padding:0px; }
.panel-body table tr td { padding-left: 15px }
.panel-body .table {margin-bottom: 0px; }
</style>
</head>
<body>
<div class="container">
<?php
// Helper libraries. get_index(), set_index(), save_doc() and save_term_to_index()
// are called further down but are not defined in this file; presumably they are
// provided by these two includes (verify).
include_once './files_functions.php';
include_once './index_functions.php';
// Connection parameters.
// NOTE(review): the credentials are hard-coded in a web-served script; consider
// moving them to configuration kept outside the document root.
$servername = "localhost";
$username = "ruben_ben";
$password = "ruben_ben";
$dbname = "information_retrieval";
// $conn is not referenced again in this file. Presumably the included helpers
// read it from the global scope - TODO confirm before renaming or moving it.
$conn = new mysqli($servername, $username, $password, $dbname);
// Check connection: on failure stop the script and print the driver's error
// message. The surrounding HTML has already been started at this point, so the
// page is left without its closing tags.
if ($conn->connect_error) {
die("Connection failed: " . $conn->connect_error);
}
// Load the current index. It is extended term by term later in this script and
// written back with set_index().
$index = get_index();
// Reads one source document into a 4-element array:
//   [0], [1], [2]  the first three lines of the file, trimmed
//                  ("" for every line the file does not have)
//   [3]            everything after the third line, unmodified
// Returns false when $path is not a readable regular file.
$read_document = function ($path) {
    if (!is_file($path) || !is_readable($path)) {
        return false;
    }
    $fp = fopen($path, "r");
    if ($fp === false) {
        return false;
    }
    $doc = array();
    for ($i = 0; $i < 3; $i++) {
        $line = fgets($fp);
        // fgets() returns false at end of file; pad with "" so that the body
        // always ends up at index 3, whatever the length of the file.
        $doc[] = ($line === false) ? '' : trim($line);
    }
    // Read from the current stream position. The header length must not be
    // recomputed from the trimmed lines: that count misses the line breaks and
    // any stripped whitespace, so seeking to it would start inside the header.
    $rest = stream_get_contents($fp);
    $doc[] = ($rest === false) ? '' : $rest;
    fclose($fp);
    return $doc;
};

// List the source folder (descending order). A missing or unreadable folder
// simply means there is nothing to add.
$files = is_dir('source/') ? scandir('source/', 1) : false;
if ($files === false) {
    $files = array();
}

// One entry per document, in the 4-element layout described above.
$doc_arr = array();
foreach ($files as $file) {
    // Skip '.', '..' and any other dot-entry returned by scandir().
    if ($file[0] === '.') {
        continue;
    }
    $doc = $read_document('source/' . $file);
    if ($doc !== false) {
        $doc_arr[] = $doc;
    }
}
// Store every document and flatten its body (element 3) into one list of
// lower-cased words, each paired with the id save_doc() returned for that document.
$words_arr = array();
foreach ($doc_arr as $document) {
    $doc_id = save_doc($document);
    foreach (str_word_count($document[3], 1) as $word) {
        $entry = array("term" => strtolower($word), "id" => $doc_id);
        $words_arr[] = $entry;
    }
}
// Order the (term, id) pairs by term. The term column is extracted first because
// array_multisort() sorts by its first array and reorders the second one to match.
$term_arr = array_map(
    function ($entry) {
        return $entry['term'];
    },
    $words_arr
);
array_multisort($term_arr, SORT_ASC, $words_arr);
// Collapse the sorted (term, id) list into one entry per term:
//   "docs" - the ids seen for the term, consecutive repeats dropped
//   "hits" - how many ids were stored for the term
$temp_index = array(); // becomes the posting lists handed to the index
$last_term = "";
$last_id = "";
foreach ($words_arr as $entry) {
    $word = $entry["term"];
    $doc_id = $entry["id"];
    if ($word == $last_term) {
        // Same term as the previous pair: only a change of document counts.
        if ($doc_id != $last_id) {
            $temp_index[$word]["docs"][] = $doc_id;
            $temp_index[$word]["hits"]++;
        }
    } else {
        // First occurrence of this term.
        $temp_index[$word] = array("hits" => 1, "docs" => array($doc_id));
    }
    $last_term = $word;
    $last_id = $doc_id;
}
// Feed every collected term into the index, threading the value returned by
// save_term_to_index() into the next call, then hand the result to set_index().
foreach ($temp_index as $word => $posting) {
    $hits = $posting["hits"];
    $doc_ids = $posting["docs"];
    $index = save_term_to_index($word, $hits, $doc_ids, $index);
}
set_index($index); // update the database
// Delete every regular file from the source folder now that its content has
// been added. Sub-directories (including '.' and '..') are left alone.
$source_dir = is_dir('source/') ? opendir('source/') : false;
// Only iterate when the folder could actually be opened: readdir() must never
// be handed the false that a failed opendir() returns.
if ($source_dir !== false) {
    while (false !== ($file = readdir($source_dir))) {
        $path = 'source/' . $file;
        if (is_file($path)) {
            unlink($path);
        }
    }
    // Release the directory handle.
    closedir($source_dir);
}
?>
<h1> Added </h1>
<h3><a href="index.php" role="button" class="btn btn-primary"><span class="glyphicon glyphicon-search"></span> Start Searching</a></h3>
<h3><a href="admin.php" role="button" class="btn btn-primary"><span class="glyphicon glyphicon-user"></span> Back To Admin</a></h3>
</div>
</body>
</html>