-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmain.cpp
More file actions
115 lines (95 loc) · 2.51 KB
/
main.cpp
File metadata and controls
115 lines (95 loc) · 2.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#include "index.h"
#include "link.h"
#include <chrono>
#include <fstream>
#include <iostream>
#include <regex>
#include "exporter.h"
#include "page.h"
using namespace std;
// File-scope index object. It is not referenced anywhere in the visible part
// of this file; NOTE(review): presumably used by other translation units or
// kept for its construction side effects — confirm before removing.
Index wikiIndex;
// File-scope exporter shared by worker() (one exportPage call per parsed page)
// and main() (finalize once at the end). The meaning of the "wiki" argument is
// defined by Exporter — presumably an output name/prefix; verify in exporter.h.
Exporter exporter("wiki");
void worker(string page) {
regex titleRegex("<title>([^<>]*)</title>");
regex redirectRegex("<redirect +title=\"([^\"]*)\" */>");
regex textRegex("<text[^<>]*>([^<>]*)</text>");
smatch m;
bool haveRedirect = false;
string title, redirect, text;
if (regex_search(page, m, titleRegex))
title = m[1];
if (regex_search(page, m, redirectRegex))
redirect = m[1], haveRedirect = true;
if(haveRedirect)
{
Page page(title, redirect);
exporter.exportPage(page.exportPage());
}
else {
int textStart = page.find("<text"); // this is not bug
int textEnd = page.find("</text>");
bool start = false;
for (int i = textStart; i < textEnd; i++) {
if (start)
text += page[i];
if (page[i] == '>')
start = true;
}
auto links = LinkRecognizer::parseLinks(text);
Page page(title);
for(auto link: links)
page.addLink(link);
exporter.exportPage(page.exportPage());
}
}
/// Advances a streaming match of `tag` by one input character.
///
/// @param tag     Tag being searched for; must be non-empty.
/// @param matched Number of leading characters of `tag` matched so far
///                (must be smaller than tag.length()).
/// @param c       Next character read from the input.
/// @return The new number of matched characters.
static size_t advanceTagMatch(const string &tag, size_t matched, char c) {
  if (c == tag[matched])
    return matched + 1;
  // On a mismatch the current character may itself begin a new match (e.g. the
  // second '<' in "<<page>"). Falling back to 0 or 1 is sufficient because the
  // first character of the tags used here ('<') does not occur again inside
  // them.
  return c == tag[0] ? 1 : 0;
}

/// Streams the Wikipedia XML dump character by character, cuts it into
/// `<page>` ... `</page>` fragments and passes each fragment to worker().
///
/// The fragment handed to worker() starts with the '>' that closes the
/// `<page>` tag and ends with "</page" (the final '>' is not included).
/// After every 1000 pages a progress line with the running pages-per-second
/// rate is written to standard output. If the dump cannot be opened, an error
/// is written to standard error and the function returns without processing.
void process()
{
  const char *inputPath = "enwiki-20190501-pages-articles-multistream.xml";
  ifstream in(inputPath);
  if (!in) {
    // Without this check the read loop would never see EOF on a stream that
    // failed to open and would spin forever.
    cerr << "Cannot open input file: " << inputPath << endl;
    return;
  }

  const string startTag = "<page>";
  const string endTag = "</page>";
  string page;
  bool inPage = false;
  size_t nextMatching = 0;
  int numberOfPages = 0;
  const auto begin = chrono::steady_clock::now();

  char c;
  // Testing the read itself (instead of eof() beforehand) avoids processing a
  // bogus character after the last byte of the file.
  while (in.get(c)) {
    const string &tag = inPage ? endTag : startTag;
    nextMatching = advanceTagMatch(tag, nextMatching, c);

    if (nextMatching == tag.length()) {
      nextMatching = 0;
      if (!inPage) {
        inPage = true;
      } else {
        inPage = false;
        numberOfPages++;
        worker(page);
        page.clear();  // keeps the allocated capacity for the next page

        if (numberOfPages % 1000 == 0) {
          // Fractional seconds: integer division would truncate the rate and
          // divide by zero during the first second.
          const chrono::duration<double> elapsed =
              chrono::steady_clock::now() - begin;
          const double seconds = elapsed.count();
          const double pagesPerSecond =
              seconds > 0.0 ? numberOfPages / seconds : 0.0;
          cout << "Number of pages: " << numberOfPages
               << " PPS: " << pagesPerSecond << endl;
        }
      }
    }

    if (inPage)
      page += c;
  }
  // The ifstream closes itself when it goes out of scope.
}
/// Program entry point: runs the dump processing, finalizes the global
/// exporter, then prints the value of Page::nextIndex labelled "Max ID".
int main()
{
  process();
  exporter.finalize();

  cout << "Max ID: ";
  cout << Page::nextIndex;
  cout << endl;
  return 0;
}