-
Notifications
You must be signed in to change notification settings - Fork 23
Expand file tree
/
Copy pathMain.java
More file actions
49 lines (40 loc) · 1.42 KB
/
Main.java
File metadata and controls
49 lines (40 loc) · 1.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import java.io.IOException;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class Main {
public static DB db = new DB();
public static void main(String[] args) throws SQLException, IOException {
db.runSql2("TRUNCATE Record;");
processPage("http://www.mit.edu");
}
public static void processPage(String URL) throws SQLException, IOException{
//check if the given URL is already in database
String sql = "select * from Record where URL = '"+URL+"'";
ResultSet rs = db.runSql(sql);
if(rs.next()){
}else{
//store the URL to database to avoid parsing again
sql = "INSERT INTO `Crawler`.`Record` " + "(`URL`) VALUES " + "(?);";
PreparedStatement stmt = db.conn.prepareStatement(sql, Statement.RETURN_GENERATED_KEYS);
stmt.setString(1, URL);
stmt.execute();
//get useful information
Document doc = Jsoup.connect("http://www.mit.edu/").get();
if(doc.text().contains("research")){
System.out.println(URL);
}
//get all links and recursively call the processPage method
Elements questions = doc.select("a[href]");
for(Element link: questions){
if(link.attr("href").contains("mit.edu"))
processPage(link.attr("abs:href"));
}
}
}
}