Processing large NumPy images in Python with parallelization

Published 2024-06-16 12:45:20


I have a complex problem. I have 50 images; each image corresponds to one date and has shape (10, 5490, 5490). What I want to do is, for every position x, y, extract the values of the ten channels at every date. I am also trying to interpolate the series to smooth the temporal profiles. I have Python code that extracts this information and saves each time series as a CSV file (1 CSV file = one time series = 1 pixel with 10 channels).

My problem is the size of my input arrays (10, 5490, 5490). When I try to load the 50 images as NumPy arrays, I get a memory error.

So I split the input images into chunks with Dask, and that way I can load the 50 images and stack them.

I am really new to parallel processing, but now I am trying to adapt the part of my code that iterates over rows and columns to extract, interpolate, and save the temporal profiles. I don't know how to adapt this part to use the Dask array I created earlier so that the time-series extraction runs in parallel. With my current for loops, extracting all the pixel time series takes a very long time.

I attach my code to show my workflow:

    # Date of images
    date_graph1 = [datetime.datetime(2020, 1, 21, 0, 0), datetime.datetime(2020, 2, 7, 0, 0), datetime.datetime(2020, 2, 25, 0, 0), datetime.datetime(2020, 3, 16, 0, 0), datetime.datetime(2020, 3, 26, 0, 0), datetime.datetime(2020, 3, 28, 0, 0), datetime.datetime(2020, 3, 31, 0, 0), datetime.datetime(2020, 4, 2, 0, 0), datetime.datetime(2020, 4, 5, 0, 0), datetime.datetime(2020, 4, 7, 0, 0), datetime.datetime(2020, 4, 10, 0, 0), datetime.datetime(2020, 4, 12, 0, 0), datetime.datetime(2020, 4, 17, 0, 0), datetime.datetime(2020, 4, 20, 0, 0), datetime.datetime(2020, 4, 22, 0, 0), datetime.datetime(2020, 4, 25, 0, 0), datetime.datetime(2020, 4, 27, 0, 0), datetime.datetime(2020, 5, 7, 0, 0), datetime.datetime(2020, 5, 12, 0, 0), datetime.datetime(2020, 5, 15, 0, 0), datetime.datetime(2020, 5, 17, 0, 0), datetime.datetime(2020, 5, 20, 0, 0), datetime.datetime(2020, 5, 27, 0, 0), datetime.datetime(2020, 5, 30, 0, 0), datetime.datetime(2020, 6, 1, 0, 0), datetime.datetime(2020, 6, 6, 0, 0), datetime.datetime(2020, 6, 21, 0, 0), datetime.datetime(2020, 6, 24, 0, 0), datetime.datetime(2020, 6, 26, 0, 0), datetime.datetime(2020, 6, 29, 0, 0), datetime.datetime(2020, 7, 9, 0, 0), datetime.datetime(2020, 7, 21, 0, 0), datetime.datetime(2020, 7, 29, 0, 0), datetime.datetime(2020, 7, 31, 0, 0), datetime.datetime(2020, 8, 5, 0, 0), datetime.datetime(2020, 8, 8, 0, 0), datetime.datetime(2020, 8, 20, 0, 0), datetime.datetime(2020, 8, 30, 0, 0), datetime.datetime(2020, 9, 7, 0, 0), datetime.datetime(2020, 9, 12, 0, 0), datetime.datetime(2020, 9, 14, 0, 0), datetime.datetime(2020, 9, 17, 0, 0), datetime.datetime(2020, 9, 19, 0, 0), datetime.datetime(2020, 9, 22, 0, 0), datetime.datetime(2020, 10, 19, 0, 0), datetime.datetime(2020, 11, 6, 0, 0), datetime.datetime(2020, 11, 18, 0, 0), datetime.datetime(2020, 11, 26, 0, 0), datetime.datetime(2020, 11, 28, 0, 0), datetime.datetime(2020, 12, 18, 0, 0)]
    
    # My binary mask to extract time series only for pixels where the mask == 1
    image_mask = './mask_image.tiff'
    ds_mask = gdal.Open(image_mask)
    mask = np.array(ds_mask.GetRasterBand(1).ReadAsArray())

    # Format dates
    dates_arr=[]
    for date in date_graph1:
        dates_arr.append(pd.Timestamp(date))
    dates_arr = np.array(dates_arr)


    # Formatting of the images as arrays
    # liste_merged is the list of image files (GeoTiff)
    # liste_cld is the list of image cloud mask files (GeoTiff)
    imageBands=[]
    for image,cld in zip(liste_merged,liste_cld):
        ds = gdal.Open(image)
        ds_cld = gdal.Open(cld)

        # With chunks
        myarrayCloud = da.from_array(ds_cld.GetRasterBand(1).ReadAsArray(),chunks=(1000,1000))
        myarray = da.from_array(ds.ReadAsArray(), chunks=(1,1000,1000))
        
        # The version without chunks (gives a memory error when I try to load all images)
        #myarrayCloud = ds_cld.GetRasterBand(1).ReadAsArray()
        #myarray = ds.ReadAsArray()

        # Set no data when the cloud mask is 1 and when pixel values are < 0 (some pixels have wrong negative values)
        myarray2 = np.where(np.logical_and(myarrayCloud==0,myarray>=0),myarray,np.nan)

        # Add to the list the (10,5490,5490) array for all dates
        imageBands.append(myarray2)

      
    imageBands = da.stack(imageBands, axis=0)
    print(imageBands)
    print('SHAPE',imageBands.shape)
    #This prints : "SHAPE (50, 10, 5490, 5490)"

    

    ########
    # This is the second part of the code, the one I need to adapt to the Dask arrays.
    # My original version works on a subset of an image, but not on the whole
    # images for all dates. In my original version imageBands is not a Dask array
    # but just a list containing the 50 arrays of size (10, 5490, 5490).
    # I have no idea how to begin adapting this part.

    i=0
    # Iteration over rows and columns
    for x in tqdm(range(0,5490)):
        for y in range(0,5490):
            #Apply the mask
            if mask[x,y] == 1:
                print ("Pixel to process")
                listeBand=[]
                # Go through the different images
                for dateIm in imageBands:
                    listeValue=[]
                    # Go through the channels of the image (10 channels)
                    for band in dateIm:
                        listeValue.append(band[x,y])
                    listeBand.append(listeValue)


                # Create the dataframe with the pixel value for each channel and each date
                df=pd.DataFrame({"B2":[item[0] for item in listeBand],
                    "B3":[item[1] for item in listeBand],
                    "B4":[item[2] for item in listeBand],
                    "B5":[item[3] for item in listeBand],
                    "B6":[item[4] for item in listeBand],
                    "B7":[item[5] for item in listeBand],
                    "B8":[item[6] for item in listeBand],
                    "B8A":[item[7] for item in listeBand],
                    "B11":[item[8] for item in listeBand],
                    "B12":[item[9] for item in listeBand]},index =dates_arr)

                # Interpolation and index value every 5 days
                dset = df.reindex(
                            pd.date_range(start=datetime.datetime(df.index.min().year,1,1),
                                            end=datetime.datetime(df.index.min().year,12,31),
                                            freq='5D'),
                            method='ffill').interpolate(method='time')#rolling("10D").sum()


                # Remove values < 0
                dset[dset < 0] = 0

                # Transpose
                dset = dset.transpose()

                # Row names
                dset['channels']='B2','B3','B4','B5','B6','B7','B8','B8A','B11','B12'
                cols = list(dset.columns)
                cols = [cols[-1]] + cols[:-1]
                dset = dset[cols]

                # Export to CSV
                dset.to_csv('/data/tests/CSV/file%s.csv'%i, sep=',',index=False)
                print ("pixel n°: %s \n"%i)

            i+=1
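Iterating pixel by pixel forces Dask to re-read every chunk thousands of times. One way to adapt the loop above is to `compute()` one spatial tile at a time and then loop over the masked pixels inside the already-materialized NumPy block. This is a minimal sketch, not my actual code: the array and mask here are small hypothetical stand-ins for the real (50, 10, 5490, 5490) stack.

```python
import numpy as np
import dask.array as da
import pandas as pd

bands = ['B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B8A', 'B11', 'B12']

# Hypothetical small stand-ins for the real (50, 10, 5490, 5490) stack and mask
imageBands = da.random.random((5, 10, 8, 8), chunks=(5, 10, 4, 4))
mask = np.zeros((8, 8), dtype=np.uint8)
mask[2, 3] = mask[5, 5] = 1

tile = 4  # spatial tile size; on the real data use e.g. 1000 to match the chunks

for x0 in range(0, imageBands.shape[2], tile):
    for y0 in range(0, imageBands.shape[3], tile):
        sub_mask = mask[x0:x0 + tile, y0:y0 + tile]
        if not sub_mask.any():
            continue  # no pixels to process in this tile
        # One compute() per tile instead of one per pixel
        block = imageBands[:, :, x0:x0 + tile, y0:y0 + tile].compute()
        for x, y in zip(*np.nonzero(sub_mask)):
            # (dates, bands) time series for this pixel
            df = pd.DataFrame(block[:, :, x, y], columns=bands)
            # ...interpolate / reindex and write the CSV exactly as before
```

Each chunk is then read from disk only once per tile, and the inner per-pixel work happens on plain NumPy data.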

             

Regarding the output, I expect one CSV file per processed pixel (this may not be the best solution either). The CSV file contains the time series for each band: rows are bands, columns are dates.

Here is a screenshot of my expected output.

Here is a Google Drive link to access one of the images.

Many thanks for your help and guidance.


1 Answer

#1 · Posted 2024-06-16 12:45:20

I made a quick attempt at this - though I have never used GeoTIFF, GDAL, or rasterio before - so there may well be better ways.

For simplicity, I made 50 copies of one day's data, called day0.tif, day1.tif, and so on. I also hard-coded the height and width of the images.

It works by processing one band at a time; within each band, I stack the 50 days of data into tiles 32 rows high and the full width of the image, then run a length-5 1-D convolution along the day axis, i.e. the last axis.

#!/usr/bin/env python3

import os
import rasterio
from rasterio.windows import Window
import numpy as np
from scipy.ndimage import convolve1d

def GetData(band, day, row, nRows):
   # https://rasterio.readthedocs.io/en/latest/topics/windowed-rw.html#windowrw
   with rasterio.open(f'day{day}.tif') as src:
      d = src.read(band,window=Window(0, row, 5490, nRows))
   # Note that you may get fewer rows than expected on the last tile
   return d

# Hard-coded for simplicity
rows, cols = 5490, 5490
rowsPer    = 32          # Number of rows we process at a time
days       = 50

print(f'pid: {os.getpid()} (for finding in "top")')

# Smoothing kernel - taking a rolling mean of N samples
kernel = np.ones(5)
kernel /= kernel.sum()
print(f'Kernel: {kernel}')

for band in range(1,11):
   print(f'DEBUG: Processing band {band}')
   for row in range(0,rows,rowsPer):
      #print(f'DEBUG: Processing row {row}')
      stackOfDays = np.zeros((rowsPer,cols,days),np.uint16)
      for day in range(0,days):
          d = GetData(band, day, row, rowsPer)
          # Stack into stackOfDays
          stackOfDays[:d.shape[0],:,day] = d
      # Convolve along last axis, i.e. along days axis
      smoothed = convolve1d(stackOfDays, kernel, mode='nearest', axis=-1)
    

It runs the whole dataset - 10 bands over 50 days of data - in 900 seconds on my MacBook Pro. I haven't saved any output because it isn't clear exactly what you want, although writing some TIFFs at the end of each band would barely change the timing.

I guess if 900 seconds is too long, you could use multiple processes, each handling one band. And of course you can play around with my tile sizes to see what suits your machine better.

You can monitor the process while it runs to see how much RAM it uses. On macOS at least, you can run:

/usr/bin/time -l script.py

or you can use top or htop.
