Advanced Data Analytics Using Python: With Machine Learning, Deep Learning and NLP Examples (2023)
Create successful ePaper yourself
Turn your PDF publications into a flip-book with our unique Google optimized e-Paper software.
Chapter 2
ETL with Python (Structured Data)
def process_pdf_link(link):
    """Download the PDF at ``link`` to ``document.pdf`` and scan the saved file.

    The response body is written to the fixed local path ``document.pdf``
    (any earlier download at that path is overwritten) and that path is
    then handed to ``get_email_ph`` with ``pdf=True``.

    Args:
        link: URL of the PDF; passed unchanged to ``urllib2.urlopen``.

    Returns:
        Whatever ``get_email_ph`` returns for the saved file.
    """
    response = urllib2.urlopen(link)
    try:
        # A PDF is binary data: 'wb' keeps the bytes intact on platforms
        # where text mode would translate newline sequences.
        with open("document.pdf", 'wb') as pdf_file:
            pdf_file.write(response.read())
    finally:
        # Release the connection even when reading or writing fails.
        response.close()
    return get_email_ph("document.pdf", pdf=True)
def process_doc_link(link):
    """Fetch the ``.doc`` file at ``link`` and scan the saved copy.

    The file is retrieved to the fixed local path ``document.doc`` and that
    path is passed to ``get_email_ph`` with ``pdf=False``.

    Args:
        link: URL of the document; passed to ``URLopener.retrieve``.

    Returns:
        Whatever ``get_email_ph`` returns for the saved file.
    """
    local_path = "document.doc"
    urllib.URLopener().retrieve(link, local_path)
    return get_email_ph(local_path, pdf=False)
def process_docx_link(link):
    """Fetch the ``.docx`` file at ``link`` and scan the saved copy.

    The file is retrieved to the fixed local path ``document.docx`` and that
    path is passed to ``get_email_ph`` with ``pdf=False``.

    Args:
        link: URL of the document; passed to ``URLopener.retrieve``.

    Returns:
        Whatever ``get_email_ph`` returns for the saved file.
    """
    local_path = "document.docx"
    urllib.URLopener().retrieve(link, local_path)
    return get_email_ph(local_path, pdf=False)
def process_links(all_links):
with open('email_ph.csv', 'wb') as csvfile:
spamwriter = csv.writer(csvfile, delimiter=',')
for link in all_links:
if link[:4] !='http':
link = "http://"+link
print link
try:
if link[-3:] == 'pdf':
try:
email, ph = process_pdf_link(link)
spamwriter.writerow([link, ' '.join(email), ' '.join(ph)])
except:
print "error",link
print sys.exc_info()
46