Created
July 13, 2020 12:12
-
-
Save JoeGlines/6f1c0710fa4abc2bcf751dd2108c62d9 to your computer and use it in GitHub Desktop.
Rip a page and shove it into Excel, then verify links, etc.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; ---------------------------------------------------------------------------
; Auto-execute section: script-wide settings, tray menu and global hotkeys.
; ---------------------------------------------------------------------------
#SingleInstance, Force
#NoEnv
SetBatchLines -1 ;run as fast as possible
;~ DetectHiddenWindows, On
;~ ListLines On ;on helps debug a script
SendMode Input ; Recommended for new scripts due to its superior speed and reliability.

; The custom tray icon lives on a machine-specific path; only apply it when the
; file is actually present so the script still starts on other machines.
TrayIcon := "B:\Progs\AutoHotKey_l\Icons\Win\ico_shell32_dll0210.ico"
if FileExist(TrayIcon)
    Menu, Tray, Icon, %TrayIcon%, 1
Menu, Tray, Add, Change ID (Control + I), Change_ID
return ; explicit end of the auto-execute section

; Browser "Forward" key reloads the script.
Browser_Forward::Reload

; Browser "Back" key opens the element-ID picker (same as Ctrl+I).
Browser_Back::
    GoSub Change_ID
return
; to do | |
; 1) put in loop | |
; 2) break out root between UAT and ti.com | |
; 3) compare uat to ti.com | |
; Run_It: drives one full pass - scrape the page, collect the Site Catalyst
; values and word counts, push everything into Excel, optionally verify links,
; then report the elapsed time.
Run_It:
    TopCounts := 15          ; number of most-frequent words extracted from the page
    Wait_For_Server := 2000  ; server wait setting for the link check (value looks like milliseconds - TODO confirm)
    gosub Extract_HTML
    gosub Site_Catalyst
    gosub Word_Count
    gosub Import_Excel
    if (verify = 1)          ; radio button in the picker GUI: 1 = "Verify URLs"
        gosub Verify_href
    gosub End
return
; Change_ID (also Ctrl+I): lists the id of every element on the current IE page
; in a drop-down so the user can pick the section to scrape, and offers the
; "verify URLs" choice. Submit/Cancel are handled by ButtonSubmit/ButtonCancel.
Change_ID:
^i::
    pwb := GetIE()
    if !pwb { ; GetIE returns nothing when no IE server control is found
        MsgBox % "Internet Explorer could not be found. Open the page in IE and try again."
        return
    }
    ; Throw away any picker left over from a previous invocation (e.g. closed with
    ; the X button); otherwise re-adding controls bound to the same variables fails.
    Gui, Destroy
    ids := ""
    all := pwb.document.all ;.tags("option") ;.tags("A")
    while (A_Index <= all.length)
        if all[A_Index-1].id ; only elements that actually have an id
            ids .= all[A_Index-1].id . "|"
    StringTrimRight, ids, ids, 1 ; drop the trailing pipe
    Gui, Add, Button, Default, Submit
    Gui, Add, Button, x+20, Cancel
    Gui, Add, Radio, x+20 vVerify group, Verify URLs
    Gui, Add, Radio, Checked y+10, Don't Verify URLs
    Gui, Add, DropDownList, x6 y+10 w200 vBody_ID, %ids%
    GuiControl, ChooseString, body_id, ls-row-3-col-2 ; try to preselect this id
    Gui, Show, h75 w250, Test
    ids := "" ; clear the list for the next invocation
return
; ButtonCancel: "Cancel" button of the picker - discard the GUI, run nothing.
ButtonCancel:
    Gui, Destroy
return
; ButtonSubmit: "Submit" button of the picker - store the control values
; (Verify, Body_ID) in their variables, close the GUI and start the run.
ButtonSubmit:
    Gui, Submit
    Gui, Destroy
    gosub Run_It
return
;******************************************************* | |
;*********************Webpage********************************** | |
;******************************************************* | |
; Extract_HTML: reads the current IE page, numbers every <a> inside the element
; chosen in the picker (Body_ID), puts a tab-separated table of those links on
; the clipboard (for Import_Excel) and saves a copy of the page HTML next to the
; script.
; Sets: StartTime, url, Full_Tab_Title, Tab_Title, Run_Search_Replace, Page_name,
;       File_name, Clipboard.
Extract_HTML:
    StartTime := A_TickCount ; start of the timed run (read by the End label)
    Run_Search_Replace := 0  ; set to 1 below when any link text contains a line break
    pwb := GetIE()
    if !pwb { ; GetIE returns nothing when no IE server control is found
        MsgBox % "Internet Explorer could not be found. Open the page in IE and try again."
        return
    }
    url := pwb.locationURL

    ;---------- worksheet name derived from the tab title ----------
    Full_Tab_Title := pwb.Document.title
    StringReplace, Tab_Title, Full_Tab_Title, - TI.com,, ; redundant suffix, and the name must be short
    Tab_Title := RegExReplace(Tab_Title, "[#/\\:&\*\?\{<>|\[\]\.]", "_") ; characters not wanted in a sheet name
    Tab_Title := RegExReplace(Tab_Title, "_+", " ")         ; runs of _ become one space
    Tab_Title := RegExReplace(Tab_Title, ".*?-(.*)", "$1")  ; keep what follows the first hyphen
    StringLeft, Tab_Title, Tab_Title, 31                    ; trim to the first 31 characters

    ;---------- element selected in the picker ----------
    Test_Section := pwb.document.getElementByID(Body_ID)
    if (Test_Section = "") {
        MsgBox % "The ID / element could not be found on this page:`n`n" Tab_Title "`n" url
        return
    }

    Tag := "a" ; tag being collected
    Test_Section_CT := Test_Section.all.tags(TAG).length
    msg := "Links=" Test_Section_CT "`t`t" Full_Tab_Title "`t" URL "`r"
    msg .= "Link #`tName`tInnerText`thref`tStatus`tOuterHTML`n"

    loop %Test_Section_CT% {
        Inner_Text := Test_Section.all.tags(TAG)[A_Index-1].InnerText
        ; Prefix the link on the live page with its running number in blue.
        Test_Section.all.tags(TAG)[A_Index-1].outerhtml := "<span style='color:blue'>" A_Index . ") </span>" . Test_Section.all.tags(TAG)[A_Index-1].OuterHTML

        ; A raw line break would split the clipboard row; mark it with a
        ; placeholder that Import_Excel turns back into a line break.
        IfInString, Inner_Text, `n
        {
            StringReplace, Inner_Text, Inner_Text, `n, , All
            StringReplace, Inner_Text, Inner_Text, `r, (chr10), All
            Run_Search_Replace := 1
        }

        ; Re-fetch after the outerHTML assignment above, then read all fields from it.
        link := Test_Section.all.tags(TAG)[A_Index-1]
        href := link.href
        name := link.name
        Outer_HTML := link.OuterHTML

        ; Strip line breaks and tabs from the markup; wrap it in quotes once
        ; if anything had to be removed.
        Cleaned_HTML := RegExReplace(Outer_HTML, "[\r\n\t]")
        if (Cleaned_HTML != Outer_HTML)
            Outer_HTML := """" . Cleaned_HTML . """"

        ; The empty field between href and OuterHTML is the Status column.
        msg .= A_Index "`t" name "`t" Inner_Text "`t" href "`t`t" Outer_HTML "`n"
    }
    Clipboard := msg
    msg =

    ;---------- save a copy of the page next to the script ----------
    SplitPath, url, Page_name, dir, ext, name_no_ext, drive
    File_name := RegExReplace(dir . Page_name, "i)^https?://")  ; drop the scheme
    File_name := RegExReplace(File_name, "[\\/:*?""<>|]", "_")  ; characters not allowed in Windows file names
    Html_Path := A_ScriptDir "\" File_name ".html"
    FileDelete, %Html_Path% ; same path FileAppend writes to below
    page := pwb.document.documentElement.OuterHTML
    HTML_page =
    ( LTrim Join
    <!DOCTYPE html>
    <html>
    <head>
    </head>
    <body>
    %page%
    </body>
    </html>
    )
    FileAppend, %HTML_page%, %Html_Path%, UTF-8
return
;******************************************************* | |
;**********************Excel********************************* | |
;************************************************** | |
; Import_Excel: adds a worksheet named after the page (Tab_Title), pastes the
; clipboard table produced by Extract_HTML at A1 and formats it.
; Arguments are passed as plain values: the former `Name:=value` call syntax
; assigned script globals as a side effect (for instance it overwrote `url`).
Import_Excel:
    Sleep 200
    try
    {
        XL := XL_Handle(1) ;1=Application 2=Workbook 3=Worksheet
        XL.Worksheets.Add().Name := Tab_Title
    } Catch {
        ; Could not use a running Excel: start a visible instance with a new workbook.
        XL := ComObjCreate("Excel.Application")
        XL.Visible := 1 ;1=Visible/Default 0=hidden
        Sleep, 500
        XL.Workbooks.Add
        Sleep, 200
        XL.Worksheets.Add().Name := Tab_Title
    }
    WinActivate, ahk_class XLMAIN
    Sleep 200
    XL_Paste2(XL, "a1", 1)
    Header_RG := "A1:E2" ; header range

    ;---------- header cells ----------
    XL.Application.ActiveSheet.Range("B1").value := Page_Name ; page name for Site Catalyst
    XL_Insert_Comment(XL, "b1", Content_Group, 0, 11, "Arial", 5)
    XL.Application.ActiveSheet.Range("F1").value := top_words ; most frequent words on the page
    XL_Insert_Comment(XL, "F1", "Top " TopCounts " words on page", 0, 11, "Book Antique", 5)

    ;---------- formatting ----------
    XL_Freeze(XL, "2")
    LR := XL_Last_Row(XL)
    XL_Format_HAlign(XL, Header_RG, 2) ;1=Left 2=Center 3=Right
    XL_Format_VAlign(XL, Header_RG, 4) ;1=Top 2=Center 3=Distrib 4=Bottom
    XL_Format_Font(XL, Header_RG, "Arial Narrow", 11)
    XL.Range("A1:F1").Interior.ColorIndex := 19 ; shade header row yellow
    XL.Range("A2:F2").Interior.ColorIndex := 6  ; intense yellow
    XL.Range("A1:F2").Font.Bold := 1
    XL_Border(XL, Header_RG, 2, 2) ;Weight: 1=Hairline 2=Thin 3=Med 4=Thick ;Line: 1=Solid 2=Dash 4=DashDot 5=DashDotDot
    XL_Row_Height(XL, "1:" LR "=-1") ; rows first, then height; -1 is auto
    XL_Col_Width_Set(XL, "A=10|B=30|C=30|D=90|E=8|F=175") ; -1 is auto

    ;---------- turn the (chr10) placeholder back into line breaks ----------
    if (Run_Search_Replace = 1)
        XL.Range("C2:C" LR).Replace("(chr10)", Chr(10))

    ;---------- make the link numbers in column A clickable ----------
    XL_Hyperlink_Offset_Col2(XL, "a3:a" LR, "3", "0") ; URL is 3 columns to the right, label is the cell itself
return
;**********************verify url in href********************************* | |
; Verify_href: requests every URL in column D (row 3 down to the last used row)
; of the active sheet via Verify_Link.
Verify_href:
    XL := XL_Handle(1) ;1=Application 2=Workbook 3=Worksheet
    LR := XL_Last_Row(XL)
    RG := "D3:D" LR
    Col_Dest := 2
    Verify_Link(XL, RG, Col_Dest)
return
;******************************************************* | |
;**********************Content********************************* | |
;******************************************************* | |
; Content: collects the inner text of the DIVs under the selected element
; (Body_ID) and pastes it into a new worksheet named "Content".
; Expects XL to already hold the Excel handle.
Content:
    pwb := GetIE()
    TAG := "DIV"
    Content_Section := pwb.document.getElementByID(Body_ID)
    if (Content_Section = "") {
        MsgBox % "The ID / element could not be found on this page:`n`n" Body_ID
        return
    }
    Grp := Content_Section.All.Tags(TAG)
    Div_CT := Grp.length - 1
    Text := "" ; start clean: Text is also written by Site_Catalyst and Word_Count
    ; NOTE(review): indices 1..length-1 are read, so the first DIV (index 0) is
    ; skipped - behaviour kept from the original; confirm that this is intended.
    Loop, %Div_CT%
        Text .= "`n" . Grp[A_Index].Innertext ; append with line break
    Text := RegExReplace(Text, "(^|\R)\K\s+") ; remove blank lines
    XL.Sheets.Add
    XL.ActiveSheet.Name := "Content"
    Clipboard := Text
    XL.Application.ActiveSheet.Range("A1").PasteSpecial()
return
;**********************Site Catalyst ********************************* | |
; Site_Catalyst: pulls the quoted values assigned to tiPageName and
; tiContentGroup out of the page source and stores them lower-cased in
; Page_Name and Content_Group. The capture is lazy (.*?), so each value ends at
; the first `";` that follows it.
Site_Catalyst:
    pwb := GetIE()
    text := pwb.document.documentElement.innerHTML

    RegExMatch(text, "tiPageName\s?=\s?""(.*?)"";", Page_Name)
    StringLower, Page_Name, Page_Name1             ; keep only the captured value

    RegExMatch(text, "tiContentGroup\s?=\s?""(.*?)"";", Content_Group)
    StringLower, Content_Group, Content_Group1     ; keep only the captured value
return
;**********************page content- most freq value count********************************* | |
; Word_Count: stores the most frequent repeated words of the page body text in
; top_words (at most TopCounts entries, formatted by DuplicateFinderAndCounter).
Word_Count:
    pwb := GetIE()
    text := pwb.Document.body.innertext
    top_words := DuplicateFinderAndCounter(text, TopCounts)
return
;******************************************************* | |
;**********************End********************************* | |
;******************************************************* | |
; end: reports how long the run took, measured from StartTime (set in Extract_HTML).
end:
    EndTime := A_TickCount
    Elapsed := EndTime - StartTime
    MsgBox, % "Verification is done and it took " MStoM(Elapsed)
return
;**********************Functions********************************* | |
;**********************Insert hyperlinks in Excel********************************* | |
; Replaces each cell in RG on the active sheet with a =Hyperlink() formula.
;   PXL      - Excel application object
;   RG       - range address whose cells receive the formula
;   URL      - column offset (from each cell) of the cell holding the link target
;   Freindly - column offset of the cell holding the label; the label is passed
;              through Round(), so it is expected to be numeric
; Cells whose target cell is empty are left untouched.
XL_Hyperlink_Offset_Col2(PXL,RG="",URL="",Freindly=""){
    For Cell in PXL.Application.ActiveSheet.Range(RG) {
        Target := Cell.Offset(0, URL).Value
        if (Target = "")
            Continue
        ; A double quote inside the target would end the formula's string
        ; literal early; Excel escapes it by doubling.
        StringReplace, Target, Target, ", "", All
        Label := Round(Cell.Offset(0, Freindly).Value)
        Cell.Value := "=Hyperlink(""" . Target . """,""" . Label . """)"
    }
}
;**********************verify URL by pinging********************************* | |
; Requests every URL found in RG on the active sheet and writes
; "<status text>/<status code>" into the cell one column to the right,
; coloured green for 200 and red otherwise.
;   PXL      - Excel application object
;   RG       - range address of the cells holding the URLs
;   Col_Dest - accepted for compatibility; the result column is fixed at offset 1
; Reads the global Wait_For_Server (milliseconds); 5 seconds is used when unset.
; Empty cells, "#" and javascript: links are skipped.
Verify_Link(PXL,RG="",Col_Dest=""){
    global Wait_For_Server
    ; WaitForResponse expects seconds, the global is kept in milliseconds.
    Timeout_Sec := Wait_For_Server ? Ceil(Wait_For_Server / 1000) : 5

    ; The % option keeps percent signs literal (otherwise %7C0% would be
    ; expanded as a variable reference).
    Cookie =
    ( LTrim %
    JSESSIONID=3F777724E42CC7EDE6FA8F37D513E038.node13; tidomain=www.ti.com; gpv_p9_o=rf430 learn nfc tab-en; s_cc=true; AP_COOKIE_EN=computerId-C_EN_286630705&geoStateCode-TX&ipGeoMapDate-1426850455286&expiryDate-1458386461914&lastVisitedDate-1426850461914&geoRegion-Americas&createdDate-1425302610354&geoCountryCode-US&ipAddress-156.117.61.214&; AB_TECHDOC_EN=%7C0%7C; AB_PREFERENCE_EN=Y; PROMO_TRACKER_EN=TM4C1230C3PM_17_en_1_2;
    )

    ; COM error notification must be on for Try/Catch to see request failures.
    Prev_ComError := ComObjError(true)
    For Cell in PXL.Application.ActiveSheet.Range(RG) {
        url := Cell.value
        RegExMatch(url, "^(?P<start>.*?)(?P<end>[?|#].*)?$", URL_) ; URL_Start = part before any ?, | or #
        url := RTrim(url, "/")
        url := RTrim(url, "#")
        if (url = "") or (url = "#")
            Continue ; nothing to verify
        if InStr(url, "javascript")
            Continue ; script links cannot be requested

        Try {
            WebRequest := ComObjCreate("WinHttp.WinHttpRequest.5.1")
            WebRequest.Open("GET", URL_Start)
            WebRequest.SetRequestHeader("Accept", "text/html;charset=utf-8")
            WebRequest.SetRequestHeader("Referer", url) ; referring site = the URL itself
            WebRequest.SetRequestHeader("Cookie", Cookie)
            IfWinExist, Fiddler
                WebRequest.SetProxy(2, "localhost:8888") ; route through Fiddler only while it is running
            WebRequest.Send()
            WebRequest.WaitForResponse(Timeout_Sec)
            Text := WebRequest.StatusText
            Status := WebRequest.Status ; numeric value
        } Catch {
            Text := "error"
            Status := "not tested"
        }

        Cell.offset(0,1).Value := Text "/" Status
        if (Status = "200")
            Cell.offset(0,1).Interior.ColorIndex := 4 ; green
        else if (Status = "need to verify manually") ; not produced by this function at present
            Cell.offset(0,1).Interior.ColorIndex := 6 ; yellow
        else
            Cell.offset(0,1).Interior.ColorIndex := 3 ; red
    }
    ComObjError(Prev_ComError) ; restore the caller's setting
}
;**********************paste into excel********************************* | |
; Pastes the clipboard into Dest_RG on the active sheet via Range.PasteSpecial.
;   PXL     - Excel application object
;   Dest_RG - destination range address
;   Paste   - 1=All 2=Values 3=Comments 4=Formats 5=Formulas are translated to
;             the matching negative Excel constants below; any other value
;             (6=Validation 7=All Except Borders 8=Col Widths 11=Formulas and
;             Number formats 12=Values and Number formats) is passed on unchanged
XL_Paste2(PXL,Dest_RG="",Paste=""){
    if (Paste = 1)
        Paste := -4104 ;xlPasteAll
    else if (Paste = 2)
        Paste := -4163 ;xlPasteValues
    else if (Paste = 3)
        Paste := -4144 ;xlPasteComments
    else if (Paste = 4)
        Paste := -4122 ;xlPasteFormats
    else if (Paste = 5)
        Paste := -4123 ;xlPasteFormulas
    PXL.Application.ActiveSheet.Range(Dest_RG).PasteSpecial(Paste)
}
;**********************get IE********************************* | |
; Returns the browser object for an Internet Explorer tab, or nothing when no
; IE server control is found.
;   Name - optional text to look for in the tab captions; when omitted the
;          first "Internet Explorer_Server" control of an IEFrame window is used
; Relies on the COM_* helper functions, which are defined outside this file.
GetIE(Name="") { ; GetIE(tab_name)
    if (Name) {
        WinGet, winList, List, ahk_class IEFrame
        ; Walk every IE window (n) and every tab caption in it until a caption
        ; containing Name is found (m = index of that tab).
        while (winList%A_Index% && !m) {
            n := A_Index, ErrorLevel := 0
            while (!ErrorLevel && !m) {
                ControlGetText, tabText, TabWindowClass%A_Index%, % "ahk_id" winList%n%
                if InStr(tabText, Name)
                    m := A_Index ; win hwnd = winList%n%
            }
        }
        ControlGet, hIESvr, hWnd, , Internet Explorer_Server%m%, % "ahk_id" winList%n%
    }
    else
        ControlGet, hIESvr, hWnd, , Internet Explorer_Server1, ahk_class IEFrame ; get Active IE

    if !hIESvr
        return

    COM_Init()
    ; Ask the server control for its document, then query the browser object from it.
    DllCall("SendMessageTimeout", "Uint", hIESvr, "Uint", DllCall("RegisterWindowMessage", "str", "WM_HTML_GETOBJECT"), "Uint", 0, "Uint", 0, "Uint", 2, "Uint", 1000, "UintP", lResult)
    DllCall("oleacc\ObjectFromLresult", "Uint", lResult, "Uint", COM_GUID4String(IID_IHTMLDocument2,"{332C4425-26CB-11D0-B483-00C04FD90119}"), "int", 0, "UintP", pdoc)
    IID_IWebBrowserApp := "{0002DF05-0000-0000-C000-000000000046}"
    pweb := COM_QueryService(pdoc,IID_IWebBrowserApp,IID_IWebBrowserApp), COM_Release(pdoc)
    return pweb
}
;**********************Time to complete********************************* | |
; Converts a duration in milliseconds to a "<minutes>m <seconds>s" string.
;   ms - duration in milliseconds
; The duration is rounded to whole seconds first and then split with integer
; arithmetic, so seconds always stay within 0-59 (e.g. 119600 -> "2m 0s").
; The global float format is not touched.
MStoM(ms) {
    total_seconds := Round(ms / 1000)
    minutes := total_seconds // 60
    seconds := Mod(total_seconds, 60)
    return minutes . "m " . seconds . "s"
}
;**********************Duplicate and word counter********************************* | |
; Returns the words that occur more than once in String, most frequent first,
; formatted as "count word|count word|...".
;   String    - text to analyse; it is split on every run of non-word characters
;   TopCounts - maximum number of entries returned (no limit when blank or 0)
; Words shorter than two characters and the stop words listed below are ignored.
; Returns an empty string when no word is repeated.
DuplicateFinderAndCounter(String, TopCounts) {
    Needle := "[\W]+"
    String := RegExReplace(String, Needle, "`n") ; one word per line
    StringLower, String, String

    ;---------- drop short words and words not worth tracking ----------
    Loop, Parse, String, `n
    {
        if (StrLen(A_LoopField) < 2) ; also skips the empty fields left by leading/trailing separators
            Continue
        if A_LoopField not in as,are,up,or,not,the,that,this,is,in,your,more,from,what,for,of,and,to,use,on,can,by,www,http,with,hi,low,high,new,index,if,id,var
            String2 .= A_LoopField "`n"
    }
    String := String2
    Sort, String ; identical words become consecutive lines

    ;---------- count each run of identical lines ----------
    p := 1, needle := "im`n)^(.*)(\n\1)+`n"
    while p := RegExMatch(String, needle, duplicate, p + StrLen(duplicate)) {
        StringReplace, s, duplicate, `n,, UseErrorLevel ; ErrorLevel = number of line feeds = occurrences
        Duplicates .= ErrorLevel A_Space duplicate1 "`n"
    }
    Duplicates := Trim(Duplicates, "`n")
    Sort, Duplicates, RF SortingWithRegEx ; order by the leading count

    ;---------- keep the first TopCounts entries ----------
    if (TopCounts > 0) && (f := InStr(Duplicates, "`n", false, 1, TopCounts))
        Duplicates := SubStr(Duplicates, 1, f - 1) ; cut before the TopCounts-th line feed
    StringReplace, Duplicates, Duplicates, `n, |, All
    return Duplicates
}

; Sort callback for DuplicateFinderAndCounter: compares the leading numbers of
; two "count word" lines so that the larger count comes first; equal counts
; compare as equal.
SortingWithRegEx(a1, a2) {
    RegExMatch(a1, "(^\d+)", f1)
    RegExMatch(a2, "(^\d+)", f2)
    return f1 > f2 ? -1 : f1 < f2 ? 1 : 0
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment