Dear Group, I am trying to write the following script to read around 0.3 million files from a remote server. It generally works, but only for the first 65 to 70 files. After that, it just prints the file names and does not process anything. Could anyone kindly suggest what I am doing wrong?
import os
import re
import time
import zipfile

import nltk
import pymysql
import pymysql.cursors
import pyPdf
import win32com.client
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from pyth.plugins.plaintext.writer import PlaintextWriter
from pyth.plugins.rtf15.reader import Rtf15Reader

# Matches any XML tag (possibly spanning lines); used to strip the markup
# from the body of a DOCX file.
_XML_TAG = re.compile('<(.|\n)*?>')

# UNC prefix that is put in front of the per-candidate part of each path.
_RESUME_ROOT = '\\\\xxx.xxx.x.xxx\\Resumes\\Cand_Res'

# Selects one row per candidate that has a resume; the file location is
# returned in the ``ResumePath`` column.
_RESUME_SQL = (
    "SELECT candidateid,cnd.FirstName, cnd.LastName,"
    "Concat('\\xxx.xxx.x.xxx\\File\\Cand_Res/',orgguid,'/',"
    "DATE_FORMAT(cnd.createddate,'%Y%m'),'/',candidateguid,'/',Resume) "
    "as ResumePath from candidate cnd "
    "join mstorganization org on cnd.orgid = org.OrgId "
    "where Resume <> '' and Resume is not null "
    "order by cnd.modifieddate limit 100000"
)


def _read_doc_words(path):
    """Return the ASCII words of a legacy .doc file, read through Word COM."""
    doc = win32com.client.GetObject(path)
    try:
        text = doc.Range().Text
    finally:
        # Close without saving. The original never closed the document, so
        # every file stayed open inside Word; that accumulation is the likely
        # reason processing stalled after a few dozen files.
        doc.Close(False)
    return text.encode('ascii', 'ignore').split()


def _read_docx_words(path):
    """Return the ASCII words of a .docx file by stripping its XML tags."""
    # The context manager releases the file handle even when reading fails.
    with zipfile.ZipFile(path) as archive:
        content = archive.read('word/document.xml').decode('utf-8')
    return _XML_TAG.sub('', content).encode('ascii', 'ignore').split()


# READING DOC/DOCX FILES FROM REMOTE LOCATION
def readfilesq9(n):
    """Fetch resume paths from MySQL and tokenize each .doc/.docx resume.

    Args:
        n: Not used by the function; kept so existing calls keep working.

    Returns:
        A list with one entry per successfully read file; each entry is the
        list of whitespace-separated ASCII words of that file. (The original
        built this list but never returned it.)

    Files that cannot be read are reported on stdout and skipped, so one bad
    file does not stop the run.
    """
    connection = pymysql.connect(host='xxx.xxx.x.xxx',
                                 user='abcd',
                                 passwd='pwd1',
                                 db='rep_db',
                                 charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)
    try:
        with connection.cursor() as cursor:
            cursor.execute(_RESUME_SQL)
            rows = cursor.fetchall()
    finally:
        # The rows are fully fetched, so the connection is no longer needed
        # while the (slow) file processing runs.
        connection.close()

    list1 = []
    for row in rows:
        try:
            # Look the column up by name; relying on the position of the key
            # inside dict.items() is not stable.
            resume_path = row['ResumePath']
            print(resume_path)

            # Keep everything from the first "/" on. The original sliced with
            # the string itself instead of this index, which raised TypeError
            # for every row.
            first_slash = resume_path.index("/")
            relative = resume_path[first_slash:]
            relative = relative.encode('ascii', 'ignore').replace("/", "\\")
            file_full = _RESUME_ROOT + relative

            # Pause between files, as in the original script.
            time.sleep(1)

            file_name1 = os.path.join(os.path.abspath(os.curdir), file_full)
            print("Path1: %s" % file_name1)
            print("Path: %s" % file_name1)

            # IDENTIFICATION OF FILE KIND: decide by the real extension so a
            # ".doc" elsewhere in the path cannot misclassify the file.
            extension = os.path.splitext(file_name1)[1].lower()
            if extension == ".doc":
                try:
                    list1.append(_read_doc_words(file_name1))
                except Exception as exc:
                    print("DOC ISSUE: %r" % (exc,))
            elif extension == ".docx":
                list1.append(_read_docx_words(file_name1))
            else:
                print("It is not a Doc file")
        except Exception as exc:
            # Report the actual error instead of hiding it behind a fixed
            # message, then continue with the next file.
            print("OOPS1: %r" % (exc,))
    return list1


# I am using Python2.7.6 on Enthought Canopy. It is not my default Python.
# My default Python is in location,"C:\\Python27". I am using MySql and
# Windows 7 Professional. Apology for any indentation error.
# Regards, Subhabrata Banerjee.
# -- https://mail.python.org/mailman/listinfo/python-list