mysql - Error using utf-8 filenames in python script -
i have seemingly impossible conundrum , hope guys can point me in right direction. have been coming , leaving project weeks , think time solve it, hopefully.
i making script supposed read bunch of .xls excel files directory structure, parse contents , load mysql database. now, in main function, list of (croatian) file names gets passed xlrd, , problem lies.
the environment date freebsd 9.1.
i following error when executing script:
mars:~/20130829> python megascript.py python version: 2.7.5 filesstem encoding is: utf-8 removing error.log if exists... doesn't. done! connecting database... done! mysql database version: 5.6.13 loading pilots... done! loading tehnicians... done! loading aircraft registrations... done! loading file list... done! processing files... /2006/1_siječanj.xls traceback (most recent call last): file "megascript.py", line 540, in <module> main() file "megascript.py", line 491, in main data = readxlsfile(files, 'upis', piloti, tehnicari, helikopteri) file "megascript.py", line 129, in readxlsfile workbook = open_workbook(f) file "/usr/local/lib/python2.7/site-packages/xlrd-0.9.2-py2.7.egg/xlrd/__init__.py", line 394, in open_workbook f = open(filename, "rb") ioerror: [errno 2] no such file or directory: u'/2006/1_sije\u010danj.xls'
i have included complete output make code fow easier follow. suppose problem in xlrd not accepting utf-8 file list. i'm not sure how around without messing around xlrd code though. ideas?
here goes code:
#! /usr/bin/env/python # -#*- coding: utf-8 -*- import os, sys, getopt, codecs, csv, mysqldb, platform mmap import mmap,access_read xlrd import open_workbook, xldate_as_tuple # define constants nalet_out = '' putnici_out = '' db_host = 'localhost' db_user = 'user' db_pass = 'pass' db_database = 'eth' start_dir = u'./' error_file = start_dir + 'mega_error.log' # functions def isnumber(s): # check if string number try: float(s) return true except valueerror: return false def getmonth(f): # izvuci mjesec iz imena datoteke u formatu "1_sijecanj.xls" temp = os.path.basename(f) temp = temp.split('_') mjesec = int(temp[0]) return mjesec def getyear(f): # izvuci godinu iz path f = f.split('/') godina = f[-2] return godina def databaseversion(cur): # print mysql database version try: cur.execute("select version()") result = cur.fetchone() except mysqldb.error, e: try: print "mysql error [%d]: %s]" % (e.args[0], e.args[1]) except indexerror: print "mysql error: %s" % (e.args[0], e.args[1]) print "mysql database version: %s" % result def getquery(cur, sql_query): # perform passed query on passed database try: cur.execute(sql_query) result = cur.fetchall() except mysqldb.error, e: try: print "mysql error [%d]: %s]" % (e.args[0], e.args[1]) except indexerror: print "mysql error: %s" % (e.args[0], e.args[1]) return result def getfiles(): files = [] # find subdirectories in [x[0] x in os.walk(start_dir)]: if (i != '.' , isnumber(os.path.basename(i))): # find files in subdirectories j in [y[2] y in os.walk(i)]: # every file in file list y in j: fn, fe = os.path.splitext(y) is_mj = fn.split("_") if(fe == '.xls' , y.find('_') , isnumber(is_mj[0])): mj = fn.split('_') files.append(i.lstrip('.') + "/" + y) # sort list cronologically files.sort(key=lambda x: getmonth(x)) files.sort(key=lambda x: getyear(x)) return files def errhandle(f, datum, var, vrijednost, ispravka = "null"): # error information, print on screen , write error.log f = unicode(str(f), 'utf-8') datum = unicode(str(datum), 'utf-8') var = unicode(str(var), 'utf-8') try: vrijednost = unicode(str(vrijednost.decode('utf-8')), 'utf-8') except unicodeencodeerror: vrijednost = vrijednost ispravka = unicode(str(ispravka), 'utf-8') err_f = codecs.open(error_file, 'a+', 'utf-8') line = f + ": " + datum + " " + var + "='" + vrijednost\ + "' ispravka='" + ispravka + "'" #print "%s" % line err_f.write(line) err_f.close() def readxlsfile(files, sheet, piloti, tehnicari, helikopteri): # read xls file , return list of rows data = [] nalet = [] putn = [] id_index = 0 # every file in list f in files: print "%s" % f temp = f.split('/') godina = str(temp[-2]) temp = os.path.basename(f).split('_') mjesec = str(temp[0]) workbook = open_workbook(f) sheet = workbook.sheet_by_name('upis') # every row doesn't contain '' or 'posada' or 'dan' etc... ri in range(sheet.nrows): if sheet.cell(ri,1).value!=''\ , sheet.cell(ri,2).value!='posada'\ , sheet.cell(ri,1).value!='dan'\ , (sheet.cell(ri,2).value!=''): temp = sheet.cell(ri, 1).value temp = temp.split('.') dan = temp[0] # datum datum = "'" + godina + "-" + mjesec + "-" + dan + "'" # kapetan kapetan = '' kapi='' if sheet.cell(ri, 2).value == "": kapetan = "null" else: kapetan = sheet.cell(ri, 2).value if kapetan[-1:] == " ": errhandle(f, datum, 'kapetan', kapetan, kapetan[-1:]) kapetan = kapetan[:-1] if(kapetan): try: kapi = [x[0] x in piloti if x[2].lower() == kapetan] kapi = kapi[0] except valueerror: errhandle(f, datum, 'kapetan', kapetan, '') kapetan = '' except indexerror: errhandle(f, datum, 'kapetan', kapetan, '') kapi = 'null' else: kapi="null" # kopilot kopilot = '' kopi = '' if sheet.cell(ri, 3).value == "": kopi = "null" else: kopilot = sheet.cell(ri, 3).value if kopilot[-1:] == " ": errhandle(f, datum,'kopilot', kopilot,\ kopilot[:-1]) if(kopilot): try: kopi = [x[0] x in piloti if x[2].lower() == kopilot] kopi = kopi[0] except valueerror: errhandle(f, datum,'kopilot', kopilot, '') except indexerror: errhandle(f, datum, 'kopilot', kopilot, '') kopi = 'null' else: kopi="null" # teh 1 teh1 = '' t1i = '' if sheet.cell(ri, 4).value=='': t1i = 'null' else: teh1 = sheet.cell(ri, 4).value if teh1[-1:] == " ": errhandle(f, datum,'teh1', teh1, teh1[:-1]) teh1 = 'null' if(teh1): try: t1i = [x[0] x in tehnicari if x[2].lower() == teh1] t1i = t1i[0] except valueerror: errhandle(f, datum,'teh1', teh1, '') except indexerror: errhandle(f, datum, 'teh1', teh1, '') t1i = 'null' else: t1i="null" # teh 2 teh2='' t2i='' if sheet.cell(ri, 5).value=='': t2i = "null" else: teh2 = sheet.cell(ri, 5).value if teh2[-1:] == " ": errhandle(f, datum,'teh2', teh2, teh2[-1:]) teh2 = '' if(teh2): try: t2i = [x[0] x in tehnicari if x[2].lower() == teh2] t2i = t2i[0] except valueerror: errhandle(f, datum,'teh2', teh2, 'null') t2i = 'null' except indexerror: errhandle(f, datum,'teh2', teh2, 'null') t2i = 'null' else: t2i="null" # oznaka oznaka = '' heli = '' if sheet.cell(ri, 6).value=="": oznaka = errhandle(f, datum, "helikopter", oznaka, "") else: oznaka = str(int(sheet.cell(ri, 6).value)) try: heli = [x[0] x in helikopteri if x[0] == oznaka] except valueerror: errhandle(f, datum, 'helikopter', oznaka, '') except indexerror: errhandle(f, datum, 'helikopter', oznaka, '') heli = '' # uvjeti uvjeti = sheet.cell(ri, 9).value # letova letova_dan = 0 letova_noc = 0 letova_ifr = 0 letova_sim = 0 if sheet.cell(ri, 7).value == "": errhandle(f, datum, 'letova', letova, '') else: letova = str(int(sheet.cell(ri, 7).value)) if uvjeti=="vfr": letova_dan = letova elif uvjeti=="ifr": letova_ifr = letova elif uvjeti=="sim": letova_sim = letova else: letova_noc = letova #block time bt_dan = "'00:00:00'" bt_noc = "'00:00:00'" bt_ifr = "'00:00:00'" bt_sim = "'00:00:00'" try: bt_tpl = xldate_as_tuple(sheet.cell(ri, 8).value, workbook.datemode) bt_m = bt_tpl[4] bt_h = bt_tpl[3] bt = "'" + str(bt_h).zfill(2)+":"+str(bt_m)+":00'" except valueerror or indexerror: errhandle(f, datum, 'bt', sheet.cell(ri,8).value, '') if uvjeti[:3]=="vfr": bt_dan = bt elif uvjeti[:3]=="ifr": bt_ifr = bt elif uvjeti[:3]=="sim": bt_sim = bt elif uvjeti[:2] == "no": bt_noc = bt else: errhandle(f, datum, 'uvjeti', uvjeti, '') # vrsta leta vrsta = "'" + sheet.cell(ri, 10).value + "'" # vjezba vjezba = 'null'; try: vjezba = sheet.cell(ri, 11).value if vjezba == '': # many results #errhandle(f, datum, 'vjezba', vjezba, '') vjezba = 'null' if vjezba == "?": errhandle(f, datum, 'vjezba', str(vjezba), '') vjezba = 'null' if str(vjezba) == 'i': errhandle(f, datum, 'vjezba', str(vjezba), '') vjezba = 'null' if str(vjezba)[-1:] == 'i': errhandle(f, datum, 'vjezba', str(vjezba),\ str(vjezba).rstrip('i')) vjezba = str(vjezba).rstrip('i') if str(vjezba).find(' ') != -1: errhandle(f, datum, 'vjezba', str(vjezba), str(vjezba).split(' ')[0]) vjezba = str(vjezba).split(' ') vjezba = vjezba[0] if str(vjezba)[-1:] == 'm': errhandle(f, datum, 'vjezba', str(vjezba), str(vjezba).rstrip('m')) vjezba = str(vjezba).rstrip('m') if str(vjezba).find(';') != -1: errhandle(f, datum, 'vjezba', str(vjezba), str(vjezba).split(';')[0]) temp = str(vjezba).split(';') vjezba = temp[0] if str(vjezba).find('/') != -1: errhandle(f, datum, 'vjezba', str(vjezba), str(vjezba).split('/')[0]) temp = str(vjezba).split('/') vjezba = temp[0] if str(vjezba).find('-') != -1: errhandle(f, datum, 'vjezba', str(vjezba), str(vjezba).split('-')[0]) temp = str(vjezba).split('-') vjezba = temp[0] if str(vjezba).find(',') != -1: errhandle(f, datum, 'vjezba', str(vjezba), str(vjezba).split(',')[0]) temp = str(vjezba).split(',') vjezba = temp[0] if str(vjezba).find('_') != -1: errhandle(f, datum, 'vjezba', str(vjezba), str(vjezba).split('_')[0]) temp = str(vjezba).split('_') vjezba = temp[0] if str(vjezba) == 'bo': errhandle(f, datum, 'vjezba', str(vjezba), '') vjezba = 'null' if str(vjezba).find(' ') != -1: if str(vjezba) == 'pp 300': errhandle(f, datum, 'vjezba', str(vjezba), str(vjezba).split(' ')[1]) temp = str(vjezba).split(' ') vjezba = temp[1] else: errhandle(f, datum, 'vjezba', str(vjezba), str(vjezba).split(' ')[0]) temp = str(vjezba).split(' ') vjezba = temp[0] if str(vjezba) == 'pp': errhandle(f, datum, 'vjezba', str(vjezba), '') vjezba = '' except unicodeencodeerror: errhandle(f, datum, 'unicode error! vjezba', vjezba, '') if vjezba != 'null': vjezba = int(float(vjezba)) # visinska slijetanja # putnici vp1 = str(sheet.cell(ri, 12).value) bp1 = str(sheet.cell(ri, 13).value) vp2 = str(sheet.cell(ri, 14).value) bp2 = str(sheet.cell(ri, 15).value) # teret teret = '' teret = str(sheet.cell(ri, 16).value) if teret == '': teret = 0 # baja baja = '' if sheet.cell(ri, 17).value == '': baja = 0 else: baja = int(sheet.cell(ri, 17).value) / 2 # dodano /2 da se dobiju tone # redosljed csv id_index = id_index + 1 row = [id_index, datum, kapi, kopi, t1i, t2i, oznaka,\ letova, letova_dan, letova_noc, letova_ifr,\ letova_sim, bt, bt_dan, bt_noc, bt_ifr,\ bt_sim, vrsta, vjezba, teret, baja] row = [str(i) in row] nalet.append(row) putn = [] if bp1 != '': put = [id_index, vp1, bp1] putn.append(put) if bp2 != '': put = [id_index, vp2, bp2] putn.append(put) data.append(nalet) data.append(putn) return data def main(): # python version print "\npython version: %s \n" % platform.python_version() # print filesystem encoding print "filesstem encoding is: %s" % sys.getfilesystemencoding() # remove error file if exists print "removing error.log if exists..." try: os.remove(error_file) print "it did." except oserror: print "it doesn't." pass print "done!" # connect database print "connecting database..." db = mysqldb.connect(db_host, db_user, db_pass, db_database,\ use_unicode=true, charset='utf8') cur=db.cursor() print "done!" # database version databaseversion(cur) # load pilots, tehnicians , helicopters db print "loading pilots..." sql_query = "select eth_osobnici.id, eth_osobnici.ime,\ eth_osobnici.prezime eth_osobnici right join \ eth_letacka_osposobljenja on eth_osobnici.id=\ eth_letacka_osposobljenja.id_osobnik \ eth_letacka_osposobljenja.vrsta_osposobljenja='kapetan' \ or eth_letacka_osposobljenja.vrsta_osposobljenja='kopilot'" #piloti = [] #piloti = getquery(cur, sql_query) piloti=[] temp = [] temp = getquery(cur, sql_query) row in temp: piloti.append(row) print "done!" print "loading tehnicians..." sql_query = "select eth_osobnici.id, eth_osobnici.ime,\ eth_osobnici.prezime eth_osobnici right join \ eth_letacka_osposobljenja on eth_osobnici.id=\ eth_letacka_osposobljenja.id_osobnik \ eth_letacka_osposobljenja.vrsta_osposobljenja='tehničar 1' \ or eth_letacka_osposobljenja.vrsta_osposobljenja='tehničar 2'" tehnicari=[] temp = [] temp = getquery(cur, sql_query) row in temp: tehnicari.append(row) print "done!" print "loading aircraft registrations..." sql_query = "select id eth_helikopteri" helikopteri=[] temp = [] temp = getquery(cur, sql_query) row in temp: helikopteri.append(row) print "done!" # file names process print "loading file list..." files = getfiles() print "done!" # process files array print "processing files..." data = readxlsfile(files, 'upis', piloti, tehnicari, helikopteri) print "done!" # enter new information in database result = 0 print "reseting database..." sql_query = "delete eth_nalet" cur.execute(sql_query) db.commit() sql_query = "alter table eth_nalet auto_increment=0" cur.execute(sql_query) db.commit() print "done!" print "loading data in 'eth_nalet'..." row in data[0]: sql_query = """insert eth_nalet (id, datum, kapetan, kopilot, teh1, teh2, registracija, letova_uk, letova_dan, letova_noc, letova_ifr, letova_sim, block_time, block_time_dan, block_time_noc, block_time_ifr, block_time_sim, vrsta_leta, vjezba, teret, baja) values (%s)""" % (", ".join(row)) cur.execute(sql_query) db.commit() print "done!" print "loading data in 'eth_putnici'..." row in data[1]: sql_query = """insert eth_putnici (id_leta, vrsta_putnika, broj_putnika) values (%s)""" % (", ".join(row)) cur.execute(sql_query) db.commit() print "done!" # close database connection print "closing database connection..." if cur: cur.close() if db: db.close() print "database closed!" if __name__ == '__main__': main()
i apologize not translating comments in code, old project of mine , tend make comments in english now. if needs explanation please fire away.
the funny thing if print file list screen, display fine. when passed xlrd don't seem in right format.
respectfully, me
i managed find error! wasn't due encoding error after all. logic error.
in function getfiles() stripped leading "." file list, , didn't strip "./" ought to. so, naturally file names "/2006/1_siječanj.xls" instead of "2006/1_siječanj.xls" should be. ioerror , not not unicodeencodeerror. , result of oversight script tried find absolute path instead of relative path.
well embarrassing. thank guys, hope post helps else pay more attention error types python throws @ us.
Comments
Post a Comment