Add http if missing #5

Open · wants to merge 8 commits into base: master
8 changes: 8 additions & 0 deletions .idea/.gitignore

8 changes: 8 additions & 0 deletions .idea/Email-Scraping.iml
6 changes: 6 additions & 0 deletions .idea/inspectionProfiles/profiles_settings.xml
7 changes: 7 additions & 0 deletions .idea/misc.xml
8 changes: 8 additions & 0 deletions .idea/modules.xml
6 changes: 6 additions & 0 deletions .idea/vcs.xml

143 changes: 122 additions & 21 deletions EmailScraping.py
@@ -1,5 +1,11 @@
#! python3
import concurrent
import re, urllib.request, time
from urllib.parse import urlparse, urljoin

from bs4 import BeautifulSoup
from loguru import logger
from concurrent.futures import ThreadPoolExecutor

emailRegex = re.compile(r'''
#example :
@@ -12,7 +18,7 @@
''', re.VERBOSE)
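# The body of the pattern above is collapsed in this diff view. For illustration
# only (this is not the project's actual pattern), a typical re.VERBOSE email
# regex has roughly this shape:
#
#     r'''
#     [a-zA-Z0-9._%+-]+    # local part
#     @
#     [a-zA-Z0-9.-]+       # domain
#     \.[a-zA-Z]{2,}       # top-level domain
#     '''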

# Extracting Emails
-def extractEmailsFromUrlText(urlText):
def extractEmailsFromUrlText(urlText) -> set[str]:
    extractedEmail = emailRegex.findall(urlText)
    allemails = []
    for email in extractedEmail:
@@ -23,52 +29,147 @@ def extractEmailsFromUrlText(urlText):
    for email in allemails:
        if email not in seen:  # set lookup is faster than a list membership test
            seen.add(email)
            emailFile.write(email + "\n")  # appending emails to the output file
    return seen

#HtmlPage Read Func
-def htmlPageRead(url, i):
def htmlPageRead(url, i) -> set[str]:
    try:
        logger.info(f"Reading HTML Page {url}")
        start = time.time()
        headers = { 'User-Agent' : 'Mozilla/5.0' }
        request = urllib.request.Request(url, None, headers)
        logger.info(f"Requesting {url}")
        response = urllib.request.urlopen(request)
        logger.info(f"Response from {url}")
        urlHtmlPageRead = response.read()
        urlText = urlHtmlPageRead.decode()
        print ("%s.%s\tFetched in : %s" % (i, url, (time.time() - start)))
-        extractEmailsFromUrlText(urlText)
        return extractEmailsFromUrlText(urlText)
    except:
        pass

#EmailsLeechFunction
-def emailsLeechFunc(url, i):

def emailsLeechFunc(url, i) -> set[str]:
    try:
-        htmlPageRead(url,i)
        return htmlPageRead(url,i)
    except urllib.error.HTTPError as err:
        if err.code == 404:
            try:
                logger.info(f"Fetching Cached Page {url}")
                url = 'http://webcache.googleusercontent.com/search?q=cache:'+url
-                htmlPageRead(url, i)
                return htmlPageRead(url, i)
            except:
                logger.info(f"Error in fetching Cached Page {url}")
                pass
        else:
            logger.info(f"Error in fetching {url}")
            pass

-# TODO: Open a file for reading urls
-start = time.time()
-urlFile = open("urls.txt", 'r')
-emailFile = open("emails.txt", 'a')
-i=0
-#Iterate Opened file for getting single url
-for urlLink in urlFile.readlines():
-    urlLink = urlLink.strip('\'"')
-    i=i+1
-    emailsLeechFunc(urlLink, i)
-print ("Elapsed Time: %s" % (time.time() - start))
def add_http_if_missing(url):
    if not url.startswith('http://') and not url.startswith('https://'):
        url = 'http://' + url
    return url
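# Quick usage of the new helper, for illustration (not part of this change):
#     add_http_if_missing("example.com")          -> "http://example.com"
#     add_http_if_missing("https://example.com")  -> "https://example.com"  (unchanged)
# Bare hostnames default to plain http://, not https://.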

def fetch_internal_links(url) -> set[str]:
    try:
        logger.info(f"Fetching Internal Links from {url}")
        headers = { 'User-Agent' : 'Mozilla/5.0' }
        request = urllib.request.Request(url, None, headers)
        logger.info(f"Requesting {url}")
        response = urllib.request.urlopen(request)
        logger.info(f"Response from {url}")
        soup = BeautifulSoup(response, 'html.parser')
        internal_links = set()

        excluded_domains = ['onedio', 'tiktok', 'twitter', 'facebook', 'instagram', 'linkedin', 'youtube', 'reddit', 'google', 'yandex']

-urlFile.close()
-emailFile.close()
        for link in soup.find_all('a'):
            href = link.get('href')
            if href:
                href = urljoin(url, href)  # Join the URL with the href
                href_domain = urlparse(href).netloc  # Get domain of the href
                # Only add href to internal_links if its domain does not belong to the excluded domains
                if not any(excluded in href_domain for excluded in excluded_domains):
                    internal_links.add(href)
        logger.info(f"Found {len(internal_links)} internal links.")
        return internal_links
    except Exception as e:
        logger.error(e)
        return set()
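# For reference, the standard-library calls above behave like this (illustrative values):
#     urljoin("http://example.com/team/", "about.html")             -> "http://example.com/team/about.html"
#     urljoin("http://example.com/team/", "https://twitter.com/x")  -> "https://twitter.com/x"
#     urlparse("https://twitter.com/x").netloc                      -> "twitter.com"
# so relative links are resolved against the page URL, and links whose domain
# contains an excluded name are filtered out.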

emails = set()
all_urls = set()
def run_program(urlfile, emailfile):
    urls = set()
    for urlLink in urlfile.readlines():
        urlLink = urlLink.strip('\'"')
        urlLink = add_http_if_missing(urlLink)
        urls.add(urlLink)
    logger.info(f"urls: {urls}")

    with ThreadPoolExecutor(max_workers=20) as executor:
        futures = {executor.submit(fetch_internal_links, url): url for i, url in enumerate(urls)}

        def callback(future):
            try:
                temp = future.result(timeout=5)
                for url in temp:
                    all_urls.add(url)
            except concurrent.futures.TimeoutError:
                logger.warning(f'Timeout: {temp} took too long to complete.')
            except Exception as exc:
                logger.error(f'{temp} generated an exception: {exc}')
            if all(f.done() for f in futures):
                logger.info("All futures are done for internal links.")
                for url in all_urls:
                    emailfile.write(f"{url}\n")
                # fetch_emails_with_depth(all_urls, emailfile)

        for future in futures:
            future.add_done_callback(callback)
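# A note on this callback pattern: it runs only after the future has finished, so
# future.result(timeout=5) can no longer time out here, and `temp` is unbound in the
# except branches when result() re-raises. A minimal alternative sketch, assuming the
# same futures dict (not part of this change):
#
#     from concurrent.futures import as_completed
#     for future in as_completed(futures):
#         url = futures[future]  # the URL submitted for this future
#         try:
#             all_urls.update(future.result())
#         except Exception as exc:
#             logger.error(f'{url} generated an exception: {exc}')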

current_index = 1
def fetch_emails_with_depth(urls, emailfile):
    try:
        with ThreadPoolExecutor(max_workers=15) as executor:
            futures = {executor.submit(emailsLeechFunc, url, i): url for i, url in enumerate(urls)}

            def callback(future):
                global current_index
                url = futures[future]
                try:
                    # Add a timeout of 5 seconds
                    fetched = future.result(timeout=5)  # get the result of the function
                    for email in fetched:
                        current_index = current_index + 1
                        emails.add(email)
                    logger.info(f"Total emails found: {len(emails)}")
                except concurrent.futures.TimeoutError:
                    logger.warning(f'Timeout: {url} took too long to complete.')
                except Exception as exc:
                    logger.error(f'{url} generated an exception: {exc}')
                if all(f.done() for f in futures):
                    logger.info("All futures are done.")
                    # Write all to emails.txt
                    for email in emails:
                        emailfile.write(f"{email}\n")

            for future in futures:
                future.add_done_callback(callback)
    except Exception as e:
        logger.error(e)
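# A note on shared state: add_done_callback callbacks execute on the worker threads,
# so current_index and the module-level `emails` set are updated from several threads
# at once. A minimal sketch of serialising those updates (hypothetical lock, not in
# this codebase):
#
#     import threading
#     state_lock = threading.Lock()
#     ...
#     with state_lock:
#         current_index += 1
#         emails.add(email)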

if __name__ == "__main__":
    start = time.time()
    urlFile = open("urls.txt", 'r')
    emailFile = open("emails.txt", 'a')
    try:
        run_program(urlFile, emailFile)
    except Exception as e:
        logger.error(e)
    finally:
        urlFile.close()
        emailFile.close()
    print("Elapsed Time: %s" % (time.time() - start))
