# The Code
import json
import urllib.request, urllib.parse, urllib.error
import re
import ssl
# Shared TLS context handed to every urlopen() call made by lambda_handler.
# NOTE(review): hostname checking and certificate verification are both turned
# off below, so HTTPS responses are accepted without authenticating the server.
# Confirm this is intentional before reusing ctx for anything sensitive.
ctx = ssl.create_default_context()
# NOTE(review): keep this order -- check_hostname presumably has to be disabled
# before verify_mode may be set to CERT_NONE; confirm against the ssl docs
# before reordering these two assignments.
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
# Sites scanned by lambda_handler: each key is a company label used to build
# the result keys, each value is the URL opened first for that company.
# NOTE(review): the names and URLs look like placeholders -- replace with the
# real entries before deploying.
sourceDict = {
    "Company name in Japanese 1": "https://www.xxxxxxx.co.jp/",
    "Company name in Japanese 2": "https://www.xxxxxxx.co.jp/",
    "Company name in Japanese 3": "https://www.xxxxxxx.co.jp/",
}
# Crawl budget: scanning of a site stops once more than this many of its
# pages have been opened.
_MAX_OPENED_PAGES = 200
# Upper bound, in seconds, on a single page request so that one stalled
# server cannot hang the whole invocation.
_FETCH_TIMEOUT_SECONDS = 10
# Captures the target of each `<a href="...">`. `[^"]*` keeps a match inside
# one attribute even when several links share the same line.
_HREF_RE = re.compile(r'<a href="([^"]*)"')


def _fetch_page(url):
    """Download *url*; return ``(raw_bytes, declared_charset)`` or ``None``."""
    try:
        with urllib.request.urlopen(
            url, context=ctx, timeout=_FETCH_TIMEOUT_SECONDS
        ) as response:
            return response.read(), response.headers.get_content_charset()
    except Exception:
        # The crawl is best-effort: any network, TLS, protocol or malformed-URL
        # error just means this page is skipped. Unlike a bare ``except`` this
        # still lets KeyboardInterrupt / SystemExit propagate.
        return None


def _decode_page(raw, declared_charset):
    """Decode page bytes to text; return ``None`` when no encoding fits."""
    # UTF-8 is tried first so every page that decoded before still decodes the
    # same way; the charset announced by the server (e.g. Shift_JIS) is only a
    # second chance for pages that are not valid UTF-8.
    for encoding in ("utf-8", declared_charset):
        if not encoding:
            continue
        try:
            return raw.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            continue
    return None


def _bare_site_url(site_url):
    """Return the root URL of *site_url*'s host without a leading ``www.``."""
    parts = urllib.parse.urlsplit(site_url)
    host = parts.netloc
    if host.startswith("www."):
        host = host[len("www."):]
    return parts.scheme + "://" + host + "/"


def _resolve_link(page_url, href):
    """Resolve *href* against *page_url*, dropping any ``#fragment``."""
    try:
        absolute = urllib.parse.urljoin(page_url, href)
        return urllib.parse.urldefrag(absolute)[0]
    except ValueError:
        # urllib.parse rejects some malformed hrefs (e.g. a broken IPv6 host).
        return None


def _extract_links(text, page_url, site_url, bare_url):
    """Return the same-site URLs linked from a page, in crawl order.

    Four kinds of ``<a href="...">`` targets are kept, and returned grouped in
    this order:

    1. absolute URLs under *site_url* that end with ``/``;
    2. root-relative paths (``/...``) that resolve to a URL under *page_url*;
    3. other relative paths, resolved against *page_url*;
    4. absolute URLs under *bare_url* (the host without ``www.``) that end
       with ``/``.

    Fragment-only links and non-HTTP schemes (``mailto:``, ``javascript:``,
    ...) are ignored. The result may contain duplicates.
    """
    under_site = []
    root_relative = []
    relative = []
    under_bare = []
    for href in _HREF_RE.findall(text):
        if href.startswith("http"):
            if not href.endswith("/"):
                continue
            if href.startswith(site_url):
                under_site.append(href)
            elif href.startswith(bare_url):
                under_bare.append(href)
            continue
        if not href or href.startswith("#"):
            # Would resolve to the current page and count its hits twice.
            continue
        target = _resolve_link(page_url, href)
        if target is None:
            continue
        if href.startswith("/"):
            if target.startswith(page_url):
                root_relative.append(target)
        elif target.startswith(("http://", "https://")):
            # urljoin leaves foreign schemes untouched, so this filters out
            # mailto:, javascript:, tel: and similar pseudo-links.
            relative.append(target)
    return under_site + root_relative + relative + under_bare


def _take_unseen(links, seen):
    """Return the links not yet in *seen*, recording them in *seen*."""
    fresh = []
    for link in links:
        if link not in seen:
            seen.add(link)
            fresh.append(link)
    return fresh


def _scan_site(company, site_url, keyword_re):
    """Crawl one site up to three layers deep and return its statistics.

    The returned dict always holds ``"<company>;openedcount"`` and
    ``"<company>;totalhits"``. ``"<company>;mosthitpage"`` is added only when
    the homepage could be opened and decoded.
    """
    opened_key = company + ";" + "openedcount"
    hits_key = company + ";" + "totalhits"
    stats = {opened_key: 0, hits_key: 0}
    hit_pages = {}

    fetched = _fetch_page(site_url)
    if fetched is None:
        print("Failed to open the website: ", company)
        return stats
    text = _decode_page(*fetched)
    if text is None:
        print("Failed to decode: ", company)
        return stats

    stats[opened_key] += 1
    hits = len(keyword_re.findall(text))
    if hits > 0:
        stats[hits_key] += hits
        # The homepage is recorded under the company label, not its URL.
        hit_pages[company + ";"] = hits

    bare_url = _bare_site_url(site_url)
    # Seed with both spellings of the homepage so a link back to "/" does not
    # fetch it again and double-count its hits.
    seen = {site_url, bare_url}
    frontier = _take_unseen(
        _extract_links(text, site_url, site_url, bare_url), seen
    )

    # Layer 2 pages are scanned and mined for further links; layer 3 pages are
    # only scanned.
    budget_spent = False
    for follow_links in (True, False):
        next_frontier = []
        for page_url in frontier:
            fetched = _fetch_page(page_url)
            if fetched is None:
                continue  # unreachable sub-pages are skipped silently
            stats[opened_key] += 1
            text = _decode_page(*fetched)
            if text is None:
                continue
            hits = len(keyword_re.findall(text))
            if hits > 0:
                stats[hits_key] += hits
                hit_pages[page_url + ";"] = hits
            if stats[opened_key] > _MAX_OPENED_PAGES:
                budget_spent = True
                break
            if follow_links:
                next_frontier.extend(
                    _take_unseen(
                        _extract_links(text, page_url, site_url, bare_url),
                        seen,
                    )
                )
        if budget_spent:
            break
        frontier = next_frontier

    if hit_pages:
        # max() returns the first page reaching the top count.
        top_page = max(hit_pages, key=hit_pages.get)
        top_hits = hit_pages[top_page]
    else:
        top_page = "No most hit page."
        top_hits = 0
    stats[company + ";" + "mosthitpage"] = (
        company + ";" + top_page + ";" + str(top_hits)
    )
    return stats


def lambda_handler(event: dict, context: object) -> dict:
    """Count keyword occurrences on every site listed in ``sourceDict``.

    For each company the homepage, the pages it links to, and the pages those
    link to are downloaded (stopping once more than ``_MAX_OPENED_PAGES``
    pages were opened) and searched for the keyword.

    Args:
        event: Invocation payload. ``event["keyWord"]`` is the search term; it
            is used as a regular expression.
            NOTE(review): if callers mean a literal keyword, wrap it in
            ``re.escape`` -- confirm with the callers.
        context: Not used.

    Returns:
        A dict keyed by company name. Each value is a dict with
        ``"<company>;openedcount"`` (pages opened), ``"<company>;totalhits"``
        (total matches) and, when the homepage was readable,
        ``"<company>;mosthitpage"`` formatted as
        ``"<company>;<page key>;<hits>"``.

    Raises:
        TypeError: ``keyWord`` is missing or not a string.
        ValueError: ``keyWord`` is an empty string.
        re.error: ``keyWord`` is not a valid regular expression.
    """
    thiskeyWord = event.get('keyWord')
    # Validate up front so a bad request fails before any network traffic
    # instead of part-way through the crawl.
    if not isinstance(thiskeyWord, str):
        raise TypeError("event['keyWord'] must be a string")
    if not thiskeyWord:
        raise ValueError("event['keyWord'] must not be empty")
    keyword_re = re.compile(thiskeyWord)

    resultDict = {}
    for comKey, value in sourceDict.items():
        resultDict[comKey] = _scan_site(comKey, value, keyword_re)
    return resultDict