Featured Post

URLReader revised


package com.seo;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class URLReader {

public URLReader(String url,String className,String tagName,String subject,String emails[]) {
HashSet hs = new HashSet();
URL u = null;
try {
u = new URL(url);
} catch (MalformedURLException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
}
try {
Document doc = Jsoup.parse(u, 1000 * 60 * 10);
Elements els = doc.getElementsByTag("a");
System.out.println("HOST:--->" + u.getHost());
for (Element el : els) {
if(el.attr("href").contains("void(0)"))
continue;
if (el.attr("href").contains(u.getHost())) {
System.out.println(el.attr("href"));
hs.add(el.attr("href"));
URLThread ut = new URLThread(el.attr("href"),className,tagName,subject,emails);
Thread t = new Thread(ut);
t.setName(el.attr("href"));
t.start();
System.out.println("A:-->" + t.getName() + " STARTED");
} else {
System.out.println("No Host:--->" + u.getHost()
+ el.attr("href"));

// hs.add(el.attr("href"));
if(!el.attr("href").contains("http")){
URLThread ut = new URLThread("http://"+u.getHost()
+ el.attr("href"),className,tagName,subject,emails);
Thread t = new Thread(ut);
t.setName(u.getHost() + el.attr("href"));
t.start();
System.out.println("B:-->" + t.getName() + " STARTED");
}

}
}
} catch (MalformedURLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
System.out.println(hs);
}

public static void main(String... args) throws IOException {
if(args.length<6 div="">
System.out.println(args.length+":--->"+"Usage url,className,tagName,subject,emails[],dump ");
return;
}
FileOutputStream fos=new FileOutputStream("urls.txt");
PrintStream ps=new PrintStream(fos);
FileInputStream fis=new FileInputStream("misc.txt");
DataInputStream dis=new DataInputStream(fis);
String DATA="";
HashSet hs=new HashSet();
while(DATA!=null){
try {
DATA=dis.readLine();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
if(DATA==null)
break;
hs.add(DATA);
}
URL u=null;
if(Boolean.parseBoolean(args[5]))
{
try {
u = new URL(args[0]);
} catch (MalformedURLException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
}
try {
Document doc = Jsoup.parse(u, 1000 * 60 * 10);
Elements els = doc.getElementsByTag("a");
System.out.println("HOST:--->" + u.getHost());
for (Element el : els) {
if(el.attr("href").contains("void(0)"))
continue;
Iterator i=hs.iterator();
boolean contained=false;
while(i.hasNext()){
String ft=i.next();
System.out.println("FT:-->"+ft);
if(el.attr("href").contains(ft))
{
contained=true;
break;
}
}
if(contained)
continue;
if (el.attr("href").contains(u.getHost())) {
ps.println(el.attr("href"));
} else {
System.out.println("No Host:--->" + u.getHost()
+ el.attr("href"));

if(!el.attr("href").contains("http")){
ps.println("http://"+u.getHost()
+ el.attr("href"));
}

}
}
} catch (MalformedURLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
FileInputStream fisU=new FileInputStream("urls.txt");
DataInputStream disU=new DataInputStream(fisU);
DATA="";
HashSet hsU=new HashSet();
while(DATA!=null){
try {
DATA=disU.readLine();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
if(DATA==null)
break;
hsU.add(DATA);
}
disU.close();
fisU.close();
fos.close();
File f=new File("urls.txt");
if(f.delete()){
System.out.println("<------------------file been="" deleted:-------------------="" has="">"+f.getAbsolutePath());
}else{
System.out.println("<------------------problem deleting="" file:-------------------="" the="" while="">"+f.getAbsolutePath());
}
FileOutputStream fosU=new FileOutputStream("urls-unique.txt");
PrintStream psU=new PrintStream(fosU);
for(String st:hsU){
psU.println(st);
}
}else{
String data[]=args[4].split("\\,");
URLReader url = new URLReader(
args[0],args[1],args[2],args[3],data);
}
}

}

Comments