
2024-06-01 02:40:26 发布

您现在位置:Python中文网/ 问答频道 /正文


memory usage


def _rowHasMultipleLines(row):
    """Return True when any string value in ``row`` contains a newline."""
    return any('\n' in value for value in row if isinstance(value, str))


def _countMultipleLineRows(dbName, tableName, batchSize=1000):
    """Count the rows of one table whose string values span multiple lines.

    Parameters
    ----------
    dbName : str
        Name of the PostgreSQL database to connect to (as user ``postgres``
        on ``localhost``).
    tableName : str
        Name of the table to scan.
    batchSize : int, optional
        Number of rows pulled from the server per round trip.

    Returns
    -------
    tuple
        ``(badRows, totalRows)`` for the table.
    """
    conn = psycopg2.connect(dbname=dbName, user='postgres', host='localhost')
    try:
        # A *named* cursor is a server-side cursor: rows stay on the server
        # and only ``batchSize`` of them are transferred per fetchmany().
        # An unnamed (client-side) cursor would load the whole table into
        # memory as soon as execute() returns, however it is fetched later.
        cur = conn.cursor(name='checkMultipleLine')

        # NOTE(review): the table name is interpolated directly into the
        # SQL text. This presumes the names returned by findTables() are
        # trusted identifiers -- confirm, or quote them with psycopg2.sql.
        cur.execute('select * from %s' % tableName)

        badRows = 0
        totalRows = 0
        while True:
            batch = cur.fetchmany(batchSize)
            if not batch:
                break
            totalRows += len(batch)
            badRows += sum(1 for row in batch if _rowHasMultipleLines(row))
    finally:
        # Closing the connection also discards the server-side cursor and
        # the transaction it lives in, even when the scan raised.
        conn.close()

    return badRows, totalRows


def checkMultipleLine(dbName):
    """Check for rows that contain data spanning multiple lines.

    This is the most basic of checks. If a particular row has data that
    spans multiple lines, then that particular row is corrupt. For dealing
    with these rows we must first find out whether there are places in the
    database that contain data that spans multiple lines.

    Every table reported by ``findTables(dbName)`` is scanned in sorted
    order. The per-table counts are written to the progress output, printed
    as a table and saved to ``error_MultipleLine[<dbName>].csv`` in the
    current working directory.

    Parameters
    ----------
    dbName : str
        Name of the PostgreSQL database to check.

    Returns
    -------
    pandas.DataFrame
        One row per table with the columns ``tableName``, ``badRows`` and
        ``totalRows``. The frame is empty (but still has these columns)
        when the database has no tables.
    """
    logger = logging.getLogger('mindLinc.checkSchema.checkMultipleLines')
    logger.info('Finding rows that span multiple lines')

    schema = findTables(dbName)

    results = []
    for t in tqdm(sorted(schema.keys())):
        n, N = _countMultipleLineRows(dbName, t)

        # N + 1.0 forces float division and avoids dividing by zero for
        # empty tables.
        tqdm.write('[%40s] -> [%5d][%10d][%.4e]' % (t, n, N, n/(N+1.0)))
        results.append({
            'tableName': t,
            'totalRows': N,
            'badRows'  : n,
        })

    logger.info('Finished checking for multiple lines')

    # Passing ``columns`` fixes the column order and keeps the frame valid
    # when ``results`` is empty (column selection would raise KeyError).
    results = pd.DataFrame(results, columns=['tableName', 'badRows', 'totalRows'])
    print(results)
    results.to_csv('error_MultipleLine[%s].csv' % (dbName), index=False)

    return results

Tags: in, for, data, if, that, postgres, multiple, logger