...
// See: https://en.wikipedia.org/wiki/Web_scraping
// See: https://en.wikipedia.org/wiki/Document_Object_Model
/**
 * Root node of a very small DOM-like tree built from raw HTML text.
 *
 * <p>The root's own buffer is seeded with the marker text {@code "DOM"}.
 * {@link #parse(StringBuffer)} attaches {@code Tag} nodes (one per opening tag)
 * and plain {@code Node} text holders underneath it.
 *
 * <p>NOTE(review): {@code Node} and {@code Tag} are declared elsewhere; the tree
 * calls used here ({@code add}, {@code getParent}, {@code insert},
 * {@code preorderEnumeration}, ...) look like the
 * {@code javax.swing.tree.DefaultMutableTreeNode} API - confirm against {@code Node}.
 *
 * @author jimak
 * @date Nov 13, 2022
 */
public class Dom extends Node {

    /**
     * Entry point; simply forwards to {@code JHTMLParserBuilder.main}.
     *
     * @param args command-line arguments, passed through unchanged
     */
    public static void main(String[] args) {
        JHTMLParserBuilder.main(args);
    }

    /** Creates the root node and writes the marker text {@code "DOM"} into its buffer. */
    public Dom() {
        super();
        getBuffer().append("DOM");
    }

    /**
     * Scans {@code inputBuf} character by character and grows the tree under this node.
     *
     * <ul>
     *   <li>Characters outside of tags are appended to the buffer of the current text node.</li>
     *   <li>{@code <name ...>} adds a {@code Tag} under the current node, followed by a fresh
     *       {@code Node} under that tag which collects the text that follows.</li>
     *   <li>{@code </name>} looks up the matching open tag via {@code getFirstParentTag} and
     *       continues in a fresh {@code Node} added to that tag's parent. If no match is found
     *       a diagnostic is printed and parsing continues in the current node.</li>
     *   <li>A {@code '<'} that is never closed by {@code '>'}, or that has no tag name
     *       (for example {@code <>} or {@code < b>}), is kept as literal text.</li>
     * </ul>
     *
     * <p>When scanning is done, every non-root node whose trimmed buffer is empty is removed
     * and its children are re-attached to its parent in the same position.
     *
     * @param inputBuf the HTML text to parse; must not be {@code null}
     */
    public void parse(StringBuffer inputBuf) {
        int inputLen = inputBuf.length();
        int inputI = 0;
        Node workingNode = this;
        StringBuffer workingBuf = workingNode.getBuffer();

        while (inputI < inputLen) {
            char ch = inputBuf.charAt(inputI);
            if (ch != '<') {
                workingBuf.append(ch);
                ++inputI;
                continue;
            }

            // The raw tag text keeps the single leading blank the original code used,
            // because Tag receives this buffer and may depend on its exact content.
            StringBuffer holeTag = new StringBuffer(" ");
            holeTag.append(ch);
            ++inputI;

            boolean closedok = false;
            boolean readingName = true;
            StringBuilder nameBuf = new StringBuilder();
            while (inputI < inputLen) {
                ch = inputBuf.charAt(inputI);
                ++inputI;
                holeTag.append(ch);
                if (ch == '>') {
                    closedok = true;
                    break;
                }
                if (readingName) {
                    // Any whitespace (space, tab, line break) ends the tag name,
                    // so "<div\nclass=...>" still yields the name "div".
                    if (Character.isWhitespace(ch)) {
                        readingName = false;
                    } else {
                        nameBuf.append(ch);
                    }
                }
            }
            // From here on inputI points just past the consumed tag text
            // (or equals inputLen when the input ended inside the tag).
            String tagName = nameBuf.toString();

            if (!closedok || tagName.isEmpty()) {
                // Unterminated or nameless tag: treat the consumed characters as text
                // instead of failing on tagName.charAt(0).
                workingBuf.append(holeTag);
                continue;
            }

            boolean isClosingTag = tagName.charAt(0) == '/';
            if (isClosingTag) {
                String realTag = tagName.substring(1);
                Node firstOpenedParent = workingNode.getFirstParentTag(realTag);
                if (firstOpenedParent != null) {
                    // Continue collecting text as a sibling of the tag that was just closed.
                    Node neoNode = new Node();
                    workingNode = (Node) firstOpenedParent.getParent();
                    workingNode.add(neoNode);
                    workingNode = neoNode;
                    workingBuf = workingNode.getBuffer();
                } else {
                    System.out.println(inputI + " , " + tagName + " : " + holeTag);
                    System.out.println("Didnt find any opening Tag for this ...continueing");
                }
            } else {
                // Opening tag: add the tag, then descend into a new text node beneath it.
                Tag neoTag = new Tag(holeTag, tagName);
                workingNode.add(neoTag);
                Node neoNode = new Node();
                neoTag.add(neoNode);
                workingNode = neoNode;
                workingBuf = workingNode.getBuffer();
            }
        }

        System.out.println("end main parsing.Now must remove empty nodes");
        removeEmptyNodes();
    }

    /**
     * Removes every non-root node whose buffer is empty after trimming and moves its
     * children up to the removed node's parent, keeping their relative order.
     */
    private void removeEmptyNodes() {
        // Collect first, mutate afterwards: the tree must not change while it is enumerated.
        Vector<Node> toBeRemoved = new Vector<Node>();
        Enumeration<?> en = preorderEnumeration();
        // Skip the first element so it is never a removal candidate
        // (presumably the root itself, as in DefaultMutableTreeNode - confirm).
        en.nextElement();
        while (en.hasMoreElements()) {
            Node n = (Node) en.nextElement();
            if (n.getBuffer().toString().trim().length() == 0) {
                toBeRemoved.add(n);
            }
        }

        System.out.println("must remove " + toBeRemoved.size());
        for (Node n : toBeRemoved) {
            Node np = (Node) n.getParent();
            int nIndex = np.getIndex(n);
            np.remove(nIndex);
            // Insert the children last-to-first at the same index so they end up
            // in their original order where the removed node used to be.
            for (int chs = n.getChildCount() - 1; chs > -1; --chs) {
                Node nch = (Node) n.getChildAt(chs);
                np.insert(nch, nIndex);
            }
        }
    }
}
Δεν υπάρχουν σχόλια:
Δημοσίευση σχολίου