精华内容
下载资源
问答
  • 这个是Kettle一个转换的脚本,可以通过这个demo结合java代码来进行数据清洗
  • python脚本清洗数据

    2019-06-26 14:12:29
    1,目标检测试集挑选,已经有VOC格式的JPEGImages,Annotations,和testval.txt列表 JPEGImages/62F1DBDB-4798-E8A5-4FA5-14C9E19ADA5B.jpg Annotations/62F1DBDB-4798-E8A5-4FA5-14C9E19ADA5B.xml ...

    1,目标检测测试集挑选:已有VOC格式的JPEGImages、Annotations,以及testval.txt列表(每行为"图片路径 标注路径"):

    JPEGImages/62F1DBDB-4798-E8A5-4FA5-14C9E19ADA5B.jpg Annotations/62F1DBDB-4798-E8A5-4FA5-14C9E19ADA5B.xml
    JPEGImages/2C042431-5CBE-9D69-646C-B5930204E15B.jpg Annotations/2C042431-5CBE-9D69-646C-B5930204E15B.xml

    导出列表中的图片,并为每张图片生成对应的label文件,每行格式为:C X1 Y1 W H(类别索引、左上角x、左上角y、宽、高,坐标均按图像宽高归一化)

    import os
    import cv2
    from PIL import Image
    from random import randint
    import xml.etree.ElementTree as ET
    
    names = ["A","B","C"]

    def drawRealBox(xml, w, h):
        """Convert one VOC annotation file into a normalised text label file.

        Reads ``./Annotations/<xml>`` and writes ``./labels/<stem>.txt`` with one
        line per ``<object>``: ``index x1 y1 width height``.  ``index`` is the
        position of the object's class name in ``names``; ``x1``/``y1`` are the
        top-left corner of the box, and all four geometry values are divided
        by the image size.

        Args:
            xml: File name of the annotation inside ``./Annotations``.
            w: Image width in pixels.
            h: Image height in pixels.

        Raises:
            ValueError: If an object's class name is not listed in ``names``.
        """
        xmlfile = os.path.join("./Annotations", xml)
        print(xmlfile)
        # Swap only the extension: str.replace('xml', 'txt') would also rewrite
        # any "xml" that happens to occur inside the file stem.
        txtfile = os.path.join("./labels", os.path.splitext(xml)[0] + '.txt')
        # Parse before opening the output so a broken annotation does not leave
        # an empty label file behind.
        tree = ET.parse(xmlfile)
        width = float(w)
        height = float(h)
        with open(txtfile, 'w') as f:
            for obj in tree.findall('object'):
                box = obj.find('bndbox')
                name = obj.find('name').text

                x1 = int(box.find('xmin').text)
                y1 = int(box.find('ymin').text)
                x2 = int(box.find('xmax').text)
                y2 = int(box.find('ymax').text)

                index = names.index(name)
                f.write("%d %f %f %f %f\n" % (
                    index,
                    x1 / width,
                    y1 / height,
                    (x2 - x1) / width,
                    (y2 - y1) / height,
                ))
    
    # Copy every image listed in ./testval.txt into ./image and write its
    # normalised label file through drawRealBox.
    allimg = os.listdir('./JPEGImages')
    with open("./testval.txt", 'r') as file_ynh:
        for line in file_ynh:
            # str.replace returns a new string, so the result has to be kept;
            # strip() also drops a trailing "\r" from Windows-edited lists.
            line = line.strip()
            if not line:
                continue
            # Each row is "<image path> <annotation path>"; only the image
            # file name is needed.
            txtpath = line.split(' ')[0]
            txtimg = txtpath.split('/')[-1]
            jpg = cv2.imread(os.path.join('./JPEGImages', txtimg))
            if jpg is None:
                # imread signals a missing/corrupt file with None, not an exception.
                print('skip unreadable image: ' + txtimg)
                continue
            cv2.imwrite(os.path.join('./image', txtimg), jpg)
            h, w_ = jpg.shape[:2]
            # Replace only the extension, not every "jpg" inside the name.
            xml = os.path.splitext(txtimg)[0] + '.xml'
            drawRealBox(xml, w_, h)
    

    2.解析自定义的label,生成pathname+多标签格式的txt:

    #-*-coding:utf-8-*-
    import os
    import cv2
    import random
    import shutil
    import numpy
    from PIL import Image
    
    # Header of the generated annotation XML; formatted with a dict that
    # supplies 'name', 'width' and 'height'.
    out0 ='''<?xml version="1.0" encoding="utf-8"?>
    <annotation>
    	<folder>None</folder>
    	<filename>%(name)s</filename>
    	<source>
    		<database>None</database>
    		<annotation>None</annotation>
    		<image>None</image>
    		<flickrid>None</flickrid>
    	</source>
    	<owner>
    		<flickrid>None</flickrid>
    		<name>None</name>
    	</owner>
    	<segmented>0</segmented>
    	<size>
    		<width>%(width)d</width>
    		<height>%(height)d</height>
    		<depth>3</depth>
    	</size>
    '''
    # One <object> element per box; formatted with a dict that supplies
    # 'class', 'xmin', 'ymin', 'xmax' and 'ymax'.
    out1 = '''	<object>
    		<name>%(class)s</name>
            <pose>Unspecified</pose>
    		<truncated>0</truncated>
    		<difficult>0</difficult>
    		<bndbox>
    			<xmin>%(xmin)d</xmin>
    			<ymin>%(ymin)d</ymin>
    			<xmax>%(xmax)d</xmax>
    			<ymax>%(ymax)d</ymax>
    		</bndbox>
    	</object>
    '''
    
    # Closing tag written after the last <object>.
    out2 = '''</annotation>
    '''
    
    def yololabel(allimg):
        """Cut labelled objects and background patches out of the images in ``allimg``.

        For every key ``im`` the image ``<imgroot>/<im>.jpg`` and the label file
        ``<pointroot>/<im>.txt`` are loaded (``imgroot``, ``pointroot`` and the
        XML templates ``out0``/``out1``/``out2`` are module-level names).  Each
        label line is ``<DP|JS> x1 y1 x2 y2 attr...``.  Per image this

        * writes an annotation XML (label path with ``标注文本`` replaced by
          ``TargetDetection/PointXML``),
        * saves two crops of every box, enlarged by 1.2 plus a 5 px / 3 px
          margin, under ``./TargetClassification/<DP|JS>/``,
        * saves the class and attribute tokens of every crop under
          ``./TargetAnalysis/<DP|JS>/``,
        * saves one random patch per box whose top-left corner lies outside the
          box enclosing all labelled objects, under
          ``./TargetClassification/FYB/``.

        Images that cannot be read, or whose label file is missing or
        malformed, are skipped with a message.

        Args:
            allimg: dict mapping the original image stem to the base name used
                for the generated files.
        """

        def getxywh(out):
            """Return ``[x, y, w, h]`` of the box enclosing every box in ``out``."""
            xs = []
            ys = []
            for points, _cls in out:
                xs.extend((points[0], points[2]))
                ys.extend((points[1], points[3]))
            x1 = min(xs)
            y1 = min(ys)
            return [x1, y1, max(xs) - x1 + 1, max(ys) - y1 + 1]

        def proberect(txt, w, h):
            """Parse label file ``txt`` and write the matching annotation XML.

            Returns ``(out, imgout, boxout)``: ``out`` holds
            ``[[x1, y1, x2, y2], cls]`` per box (corners normalised to
            top-left / bottom-right), ``imgout`` is the enclosing
            ``[x, y, w, h]`` of all boxes (``None`` when there is no box) and
            ``boxout`` holds ``[cls, attribute tokens]`` per box.

            Raises RuntimeError for an unknown class, non-integer coordinates
            or a box that is neither top-left/bottom-right nor the reverse.
            """
            out = []     # crop information
            boxout = []  # label information

            with open(txt) as f:
                lines = f.readlines()

            xml_path = txt.replace('标注文本', 'TargetDetection/PointXML')
            xml_path = xml_path.replace('.txt', '.xml')

            # NOTE(review): <filename> receives the label file name (.txt), as
            # in the original script - confirm the image name is not expected.
            source = {'name': txt.split('/')[-1], 'width': w, 'height': h}

            with open(xml_path, 'w') as fxml:
                fxml.write(out0 % source)
                for raw in lines:
                    fields = raw.split()
                    if not fields:
                        continue  # tolerate blank lines
                    cls = fields[0]
                    if cls != 'DP' and cls != 'JS':
                        print(txt)
                        raise RuntimeError('txt中分类标签不是DP也不是JS')
                    coords = fields[1:5]
                    # Compare by value: "is 'NaN'" tests object identity and
                    # never matched tokens produced by split().
                    if 'NaN' in coords:
                        continue
                    try:
                        x1, y1, x2, y2 = [int(v) for v in coords]
                    except ValueError:
                        # Typically a label with a missing space; name the file.
                        print(txt)
                        raise RuntimeError('box coordinates are not four integers')

                    if x1 > x2 and y1 > y2:
                        # Stored bottom-right first: swap the integers so both
                        # the XML and the later crops use the same box.
                        x1, y1, x2, y2 = x2, y2, x1, y1
                    elif x1 > x2 or y1 > y2:
                        print(txt)
                        raise RuntimeError('box的大小不是左上右下,也不是右下左上')

                    label = {'class': cls, 'xmin': x1, 'ymin': y1,
                             'xmax': x2, 'ymax': y2}
                    fxml.write(out1 % label)
                    out.append([[x1, y1, x2, y2], cls])
                    boxout.append([cls, fields[5:]])
                fxml.write(out2)

            # Area occupied by all objects of the image; undefined without boxes.
            imgout = getxywh(out) if out else None
            return out, imgout, boxout

        def groberect(points, ww, hh):
            """Enlarge box ``points`` by 1.2 around its centre, clipped to the image.

            Returns ``[[left, top, right, bottom], width, height]`` where
            width/height are the enlarged (unclipped) sizes as ints.
            """
            x1, y1, x2, y2 = points[0], points[1], points[2], points[3]

            w = (x2 - x1 + 1) * 1.2
            h = (y2 - y1 + 1) * 1.2
            px = float(x1 + x2) / 2
            py = float(y1 + y2) / 2

            l = max(0, px - w / 2)
            r = min(ww - 1, px + w / 2)
            t = max(0, py - h / 2)
            b = min(hh - 1, py + h / 2)

            return [[int(l), int(t), int(r), int(b)], int(w), int(h)]

        det_root = imgroot.replace('红绿灯图片', 'TargetDetection')
        cls_root = imgroot.replace('红绿灯图片', 'TargetClassification')
        ana_root = imgroot.replace('红绿灯图片', 'TargetAnalysis')
        wanted_dirs = [
            det_root,
            det_root + '/' + 'PointXML',
            cls_root,
            cls_root + '/' + 'DP',
            cls_root + '/' + 'JS',
            cls_root + '/' + 'FYB',
            ana_root,
            ana_root + '/' + 'DP',
            ana_root + '/' + 'JS',
        ]
        for directory in wanted_dirs:
            if not os.path.exists(directory):
                os.makedirs(directory)

        total = len(allimg)
        for num, im in enumerate(allimg, 1):
            print("%s  /  %s" % (num, total))

            impath = os.path.join(imgroot, im + '.jpg')
            img = cv2.imread(impath)
            if img is None:
                continue
            h, w = img.shape[:2]
            labfile = os.path.join(pointroot, im + '.txt')
            try:
                lines, imgout, boxouts = proberect(labfile, w, h)
            except (IOError, OSError, RuntimeError) as err:
                # Same best-effort policy as before (skip the image), but only
                # for the expected failures and with the reason reported.
                print('skip %s: %s' % (labfile, err))
                continue
            if len(lines) < 1:
                continue

            base = allimg[im]
            for idx, (line, boxout) in enumerate(zip(lines, boxouts)):
                rect, ww, hh = groberect(line[0], w, h)
                cls = line[1]  # proberect only lets 'DP' and 'JS' through
                attr_text = '%s %s\n' % (boxout[0], " ".join(boxout[1]))

                # NOTE(review): outputs use paths relative to the working
                # directory, which match the directories created above only
                # while imgroot is './红绿灯图片'.
                crops = (
                    (base + '_' + str(2 * idx) + '.jpg', 5),
                    (base + '_' + str(2 * idx + 1) + '.jpg', 3),
                )
                for jpgname, margin in crops:
                    top = max(0, rect[1] - margin)
                    left = max(0, rect[0] - margin)
                    bottom = min(h, rect[3] + margin)
                    right = min(w, rect[2] + margin)
                    crop = img[top:bottom, left:right].copy()
                    cv2.imwrite(
                        os.path.join('./TargetClassification/' + cls + '/', jpgname),
                        crop)

                    txtname = os.path.splitext(jpgname)[0] + '.txt'
                    txtpath = os.path.join('./TargetAnalysis/' + cls + '/', txtname)
                    with open(txtpath, 'w') as labf:
                        labf.write(attr_text)

                # Draw the top-left corner of a background patch until it is
                # not strictly inside the enclosing box of all objects.  The
                # corner stays within the image so the patch is never empty.
                while True:
                    x = random.randint(0, w - 1)
                    y = random.randint(0, h - 1)
                    inside_x = imgout[0] < x < (imgout[0] + imgout[2] - 1)
                    inside_y = imgout[1] < y < (imgout[1] + imgout[3] - 1)
                    if not (inside_x and inside_y):
                        break
                cropImg = img[y:(y + hh), x:(x + ww)]
                newjpgname = base + '_' + str(idx) + '_X' + '.jpg'
                cv2.imwrite(os.path.join('./TargetClassification/FYB/', newjpgname), cropImg)
    
    def listallfile_form_map(trainf, testf):
        """Load the train and test rename maps written by ``saveremap``.

        Each non-empty line of a map file is ``<original stem>:<new name>``.

        Args:
            trainf: Path of the train map file.
            testf: Path of the test map file.

        Returns:
            Tuple ``(train, test)`` of dicts mapping original stem to new name.

        Raises:
            ValueError: If a non-empty line does not contain exactly one ``:``.
        """
        def _load(path):
            mapping = {}
            with open(path, 'r') as handle:
                for li in handle:
                    li = li.replace('\n', '')
                    if not li:
                        # A trailing blank line must not abort the whole load.
                        continue
                    key, val = li.split(':')
                    mapping[key] = val
            return mapping

        return _load(trainf), _load(testf)
    
    def saveremap(f, remap):
        """Write each entry of ``remap`` to the open file ``f`` as ``key:value``.

        One entry per line; ``f`` is left open for the caller to close.
        """
        for original, renamed in remap.items():
            f.write('%s:%s\n' % (original, renamed))
    
    
    def listallfile(img, lab):
        """Pair label files with images and split them randomly into train/test.

        A label ``<stem>.txt`` in ``lab`` is used only when ``<stem>.jpg``
        exists in ``img``.  Every accepted stem gets a new name built from the
        module-level ``rename`` template and a running 8-digit counter; it goes
        to the test map when ``random.randint(0, 10) == 1`` (1 in 11),
        otherwise to the train map.

        Args:
            img: Directory containing the ``.jpg`` images.
            lab: Directory containing the ``.txt`` label files.

        Returns:
            Tuple ``(train, test)`` of dicts mapping original stem to new name.
        """
        global rename
        num = 0
        train = {}
        test = {}
        # A set makes the per-label membership test O(1) instead of O(n).
        image_names = set(os.listdir(img))
        for label_name in os.listdir(lab):
            # Split off the extension only: str.replace('txt', 'jpg') and
            # split('.')[0] mangled stems containing "txt" or a dot.
            stem, ext = os.path.splitext(label_name)
            if ext != '.txt' or (stem + '.jpg') not in image_names:
                continue
            newname = rename.format('%08d' % num)
            if random.randint(0, 10) == 1:
                test[stem] = newname
            else:
                train[stem] = newname
            num += 1
        return train, test
    
    
    ###########  main  #################
    # Source directories and the naming template for the generated crops.
    pointroot = './标注文本'
    imgroot = './红绿灯图片'
    rename = 'newTrain_2019_06_26_{}'

    # Reuse an existing train/test split so repeated runs keep the same names;
    # otherwise draw a new one.
    if os.path.exists('./train_remap.txt') and os.path.exists('./test_remap.txt'):
        train, test = listallfile_form_map('./train_remap.txt', './test_remap.txt')
    else:
        train, test = listallfile(imgroot, pointroot)

    # Close both map files before the long cropping run so the split is on
    # disk even if that run is interrupted.
    with open('./train_remap.txt', 'w') as f:
        saveremap(f, train)
    with open('./test_remap.txt', 'w') as f:
        saveremap(f, test)

    yololabel(train)
    yololabel(test)
    ###########  main  #################
    
    import os
    import cv2
    #import ranmdom
    import random
    
    # Attribute files and image crops of the DP class.
    all_labels = os.listdir("./TargetAnalysis/DP")
    all_images = os.listdir("./TargetClassification/DP")
    
    # Vocabularies of the attribute tokens; a token is encoded by its position
    # in the matching list.
    # NOTE(review): presumably light state, colour and light type codes -
    # confirm the meaning of each abbreviation with the labelling guide.
    light = ["UPP","OFF"]
    color = ["RD","YL","GR","BL"]
    typee = ["WZ","YP","LT","RT","UP","XX"]
    
    # Prefix and running counter used to name the exported images.
    name = "2019-06-28-"
    num=0
    
    
    def is_makedirs(dirs):
        """Create directory ``dirs`` (including parents) unless the path exists."""
        if os.path.exists(dirs):
            return
        os.makedirs(dirs)
    
    is_makedirs("./txt")
    cunimage_save = "./save_image"
    is_makedirs(cunimage_save)


    def _one_hot(value, choices):
        """Return a new one-hot list marking the position of ``value`` in ``choices``."""
        encoded = [0] * len(choices)
        encoded[choices.index(value)] = 1
        return encoded


    # Export every DP crop: save it under a new running name in
    # ``cunimage_save`` and append "<path> <33 integers>" to the train or test
    # list (about 1 in 10 goes to test).  The 33 integers are the three light
    # states followed by three 4-way colour and three 6-way type one-hots.
    with open("./txt/dp_train.txt", "w") as trainf, open("./txt/dp_test.txt", "w") as testf:
        for image in all_images:
            label = os.path.join("./TargetAnalysis/DP", image.replace(".jpg", ".txt"))
            img = cv2.imread("./TargetClassification/DP" + "/" + image)
            print(image)
            if img is None:
                # imread reports an unreadable file by returning None.
                print('fail to read xxx.jpg')
                continue
            with open(label, "r") as labf:
                labs = labf.readlines()
            for lab in labs:
                lab = lab.replace("\n", "")
                print(lab)
                lab = lab.split(" ")
                print(lab[0])
                # Compare for equality: `x in "JS"` is a substring test and
                # also matched "J" and "S".  Blank lines stay skipped.
                if lab[0] in ("", "JS"):
                    continue
                lab = lab[1:]
                print(lab[0])
                labout = [light.index(lab[i]) for i in range(3)]
                # Every light needs its own list; the original reused a single
                # list, so lights 2 and 3 inherited the bits of earlier lights.
                for i in range(3, 6):
                    labout = labout + _one_hot(lab[i], color)
                for i in range(6, 9):
                    labout = labout + _one_hot(lab[i], typee)
                num = num + 1
                print(labout)
                jpgname = name + "{}.jpg".format("%08d" % num)
                cv2.imwrite(os.path.join(cunimage_save, jpgname), img)
                if random.randint(1, 10) == 1:
                    f = testf
                else:
                    f = trainf
                # NOTE(review): the listed path is ./data/image/ while the file
                # is saved in cunimage_save - presumably moved later; confirm.
                f.write("./data/image/%s" % jpgname)
                for i in labout:
                    f.write(" %d" % i)
                f.write("\n")
    
    
    
    all_labels = os.listdir("./TargetAnalysis/JS")
    all_images = os.listdir("./TargetClassification/JS")

    # Export every JS crop: save it under a new running name in
    # ``cunimage_save`` and append "<path> <light index>" to the train or test
    # list (about 1 in 10 goes to test).
    with open("./txt/js_train.txt", "w") as trainf, open("./txt/js_test.txt", "w") as testf:
        for image in all_images:
            label = os.path.join("./TargetAnalysis/JS", image.replace(".jpg", ".txt"))
            img = cv2.imread("./TargetClassification/JS" + "/" + image)
            print(image)
            if img is None:
                # imread reports an unreadable file by returning None.
                print('fail to read xxx.jpg')
                continue
            with open(label, "r") as labf:
                labs = labf.readlines()
            for lab in labs:
                lab = lab.replace("\n", "")
                print(lab)
                lab = lab.split(" ")
                print(lab[0])
                # Compare for equality: `x in "JS"` is a substring test, so an
                # empty line passed it and then failed on lab[0] below.
                if lab[0] != "JS":
                    continue
                lab = lab[1:]
                print(lab[0])
                labout = [light.index(lab[0])]
                num = num + 1
                print(labout)
                jpgname = name + "{}.jpg".format("%08d" % num)
                cv2.imwrite(os.path.join(cunimage_save, jpgname), img)
                if random.randint(1, 10) == 1:
                    f = testf
                else:
                    f = trainf
                f.write("./data/image/%s" % jpgname)
                for i in labout:
                    f.write(" %d" % i)
                f.write("\n")
    
    all_images = os.listdir("./TargetClassification/FYB")

    # Export every FYB (background) patch: save it under a new running name in
    # ``cunimage_save`` and list its path, without labels, in the train or
    # test list (about 1 in 10 goes to test).
    with open("./txt/fyb_train.txt", "w") as trainf, open("./txt/fyb_test.txt", "w") as testf:
        for image in all_images:
            img = cv2.imread("./TargetClassification/FYB" + "/" + image)
            print(image)
            if img is None:
                # imread reports an unreadable file by returning None.
                print('fail to read xxx.jpg')
                continue

            num = num + 1
            jpgname = name + "{}.jpg".format("%08d" % num)
            # Save the patch under its new name like the DP and JS exports do;
            # without this the listed file would not exist.
            cv2.imwrite(os.path.join(cunimage_save, jpgname), img)
            if random.randint(1, 10) == 1:
                f = testf
            else:
                f = trainf
            f.write("./data/image/%s" % jpgname)
            f.write("\n")
    
    
    
     #-*-coding:utf-8-*-
    import os
    import cv2
    import random
    import shutil
    import numpy
    from PIL import Image
    
    
    # Merge the per-class lists into one multi-label list per split, then
    # shuffle it.  Every output line is the image path followed by 38 integers
    # (layout per the original author's notes):
    #   3    sample kind one-hot: light panel (DP), timer (JS), background
    #   2    timer on, timer off
    #   3    state of panel lights 1-3
    #   4x3  colour flags of lights 1-3 (red, yellow, green, black)
    #   6x3  type flags of lights 1-3
    # Fields that do not apply to a sample kind are filled with 255.
    _SRN_IMAGE_DIR = './data/Red_Yellow_Green/images/'
    _SRN_FILLER = 255
    _SRN_ATTR_COUNT = 33  # 3 light states + 3 * 4 colour flags + 3 * 6 type flags


    def _read_lines(path):
        """Return all lines of the text file ``path``."""
        with open(path, 'r') as handle:
            return handle.readlines()


    def _parse_labelled(lab):
        """Split a "<path> <int> ..." line into (image file name, list of ints)."""
        fields = lab.replace('\n', '').split(' ')
        return fields[0].split('/')[-1], [int(i) for i in fields[1:]]


    def _srn_line(imgname, kind, timer, attrs):
        """Format one output line: image path, then all values space-separated."""
        values = list(kind) + list(timer) + list(attrs)
        return '%s%s %s\n' % (_SRN_IMAGE_DIR, imgname,
                              ' '.join('%d' % v for v in values))


    def _write_srn(out_path, dp_lines, js_lines, fyb_lines):
        """Write the DP, JS and FYB samples (in that order) to ``out_path``."""
        no_attrs = [_SRN_FILLER] * _SRN_ATTR_COUNT
        no_timer = (_SRN_FILLER, _SRN_FILLER)
        with open(out_path, 'w') as out:
            for lab in dp_lines:
                imgname, labInfo = _parse_labelled(lab)
                # Index explicitly so a short label line still raises IndexError.
                attrs = [labInfo[i] for i in range(_SRN_ATTR_COUNT)]
                out.write(_srn_line(imgname, (1, 0, 0), no_timer, attrs))
            for lab in js_lines:
                imgname, labInfo = _parse_labelled(lab)
                # NOTE(review): a label value of 1 is written as (on=1, off=0);
                # in the script that produces js_*.txt, index 1 of `light` is
                # "OFF" - confirm the intended orientation.
                timer = (1, 0) if labInfo[0] == 1 else (0, 1)
                out.write(_srn_line(imgname, (0, 1, 0), timer, no_attrs))
            for lab in fyb_lines:
                imgname = lab.replace('\n', '').split('/')[-1]
                out.write(_srn_line(imgname, (0, 0, 1), no_timer, no_attrs))


    # Read all six inputs first so a missing list fails before anything is written.
    label_1 = _read_lines('./txt/dp_train.txt')
    label_2 = _read_lines('./txt/dp_test.txt')

    label_3 = _read_lines('./txt/js_train.txt')
    label_4 = _read_lines('./txt/js_test.txt')

    label_5 = _read_lines('./txt/fyb_train.txt')
    label_6 = _read_lines('./txt/fyb_test.txt')

    _write_srn('./txt/train_srn_1.txt', label_1, label_3, label_5)
    _write_srn('./txt/test_srn_1.txt', label_2, label_4, label_6)

    # Shuffle the merged lists into the final train/test files.
    import random
    train_lines = _read_lines('./txt/train_srn_1.txt')
    test_lines = _read_lines('./txt/test_srn_1.txt')

    random.shuffle(train_lines)
    random.shuffle(test_lines)

    with open('./txt/train_srn.txt', 'w') as shuffle_f_train:
        shuffle_f_train.writelines(train_lines)
    with open('./txt/test_srn.txt', 'w') as shuffle_f_test:
        shuffle_f_test.writelines(test_lines)

     

    展开全文
  • 2、插入shell脚本 需要设置环境变量,E:\Anaconda3;E:\Anaconda3\Scripts;E:\Anaconda3\Library\bin 需要添加.py文件默认打开方式为python.exe 3、新建转换 首先设置变量以便于动态读取时间戳和动态读取excel...

    1、新建job
    2、插入shell脚本

    	需要设置环境变量,E:\Anaconda3;E:\Anaconda3\Scripts;E:\Anaconda3\Library\bin
    	需要添加.py文件默认打开方式为python.exe
    


    3、新建转换

    	首先设置变量以便于动态读取时间戳和动态读取excel文件目录;
    	在执行sql脚本中需要勾选执行每一行,变量替换,绑定参数?;
    	在excel输入中选中的文件的文件/目录下使用${变量},即可取到excel文件名。
    

    在这里插入图片描述
    4、在数据库中建表,执行所有步骤即可。

    	-- Tablespace for the flow data: one datafile of 128M that grows in
    	-- 128M steps up to 20G.
    	create TABLESPACE ts_flow_data LOGGING datafile 'E:\ORACLE\T_FLOW_DATA_01.DBF' SIZE 128M 
    	AUTOEXTEND ON NEXT 128M MAXSIZE 20G;
    	COMMIT;
    
    	-- Target table, stored in the tablespace created above.
    	CREATE TABLE t_flow_data ( fid NUMBER, fcleandate TIMESTAMP(6), fbizdate TIMESTAMP(6), 
    	fcoustomername  NVARCHAR2(255), fmaterialname NVARCHAR2(255), fmodel NVARCHAR2(255), 
    	fqty NUMBER(28, 16), fprice NUMBER(19, 4), famount NUMBER(19, 4), fshengchanqiye NVARCHAR2(255), 
    	flot NVARCHAR2(80), fmfg TIMESTAMP(6), fexp TIMESTAMP(6), fsupplier NVARCHAR2(255))
    	TABLESPACE ts_flow_data;
    	COMMIT;
    
    	-- Sequence that supplies the values for t_flow_data.fid.
    	CREATE SEQUENCE seq_flow_data MINVALUE 1 NOMAXVALUE START WITH 1 
    	INCREMENT BY 1 CACHE 100 ORDER;
    	COMMIT;
    
    	-- Before each insert, set fid to the next value of seq_flow_data.
    	CREATE TRIGGER trg_flow_data
    	BEFORE INSERT ON t_flow_data
    	FOR EACH ROW
    		BEGIN
    		    SELECT
    		        seq_flow_data.NEXTVAL
    		    INTO :new.fid
    		    FROM
    		        dual;
    		END;
    	COMMIT;
    
    展开全文
  • import sys import datetime for line in sys.stdin: line = line.strip() userid, movieid, rating, unixtime = line.split('\t') ... weekday = datetime.datetime.fromtimestamp(float(unixtime)).i...
    import sys
    import datetime
    
    # Streaming mapper: each stdin record is "userid<TAB>movieid<TAB>rating<TAB>unixtime";
    # the unix time is replaced by its ISO weekday number and the record is
    # written back tab-separated.
    for record in sys.stdin:
        userid, movieid, rating, unixtime = record.strip().split('\t')
        moment = datetime.datetime.fromtimestamp(float(unixtime))
        weekday = moment.isoweekday()
        print('\t'.join((userid, movieid, rating, str(weekday))))
    

    Use the mapper script:

    -- Result table: same columns as the mapper output, stored as
    -- tab-delimited text.
    CREATE TABLE u_data_new (
      userid INT,
      movieid INT,
      rating INT,
      weekday INT)
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t';
    
    -- Register the mapper script as a session resource so the TRANSFORM
    -- clause below can run it.
    add FILE weekday_mapper.py;
    
    -- Stream every u_data row through weekday_mapper.py and overwrite
    -- u_data_new with the rows the script emits.
    INSERT OVERWRITE TABLE u_data_new
    SELECT
      TRANSFORM (userid, movieid, rating, unixtime)
      USING 'python weekday_mapper.py'
      AS (userid, movieid, rating, weekday)
    FROM u_data;
    
    -- Count the ratings per weekday value produced by the mapper.
    SELECT weekday, COUNT(*)
    FROM u_data_new
    GROUP BY weekday;




    1. FROM (  
    2. MAP doctext USING 'python wc_mapper.py' AS (word, cnt)  
    3. FROM docs  
    4. CLUSTER BY word  
    5. ) a  
    6. REDUCE word, cnt USING 'python wc_reduce.py';  

    转载于:https://www.cnblogs.com/charlie-badegg/p/3947065.html

    展开全文
  • shell脚本数据清洗

    2019-04-16 16:09:36
    本文通过上一节Hadoop离线项目之数据清洗开发的数据清洗jar包,对日志文件进行清洗,并把清洗后的结果移动到hive表里,并刷新元数据信息。并把这个过程写到shell脚本里。 jar包路径: /home/hadoop/app/hadoop-2.6.0...

    本文通过上一节Hadoop离线项目之数据清洗开发的数据清洗jar包,对日志文件进行清洗,并把清洗后的结果移动到hive表里,并刷新元数据信息。并把这个过程写到shell脚本里。
    jar包路径:

    /home/hadoop/app/hadoop-2.6.0-cdh5.7.0/lib/g6-hadoop-1.0.jar
    

    jar包主程序:

    com.ruozedata.hadoop.mapreduce.driver.LogETLDriver
    

    hdfs上日志文件路径:

    /g6/hadoop/accesslog/
    

    hdfs上日志文件清洗后要放的路径:

    /g6/hadoop/access/output/
    

    hive外部表g6_access指定的路径:

    /g6/hadoop/access/clear/
    

    shell脚本代码:

    #!/bin/bash
    # Clean one day's access log with the LogETLDriver MapReduce job, move the
    # cleaned output under the directory of the g6_access external table, and
    # register the day's partition in Hive.
    #
    # Usage: g6-train-hadoop.sh <dateString>    e.g. g6-train-hadoop.sh 20180717
    
    # Stop at the first failing command. Without this a failed ETL run would
    # still reach step2 and delete the previously cleaned partition directory.
    set -e
    
    if [ $# != 1 ] ; then
    echo "USAGE: g6-train-hadoop.sh <dateString>"
    echo " e.g.: g6-train-hadoop.sh 20180717"
    exit 1;
    fi
    
    process_date=$1
    
    echo "--------------------step1: mapreduce etl--------------------"
    hadoop jar /home/hadoop/app/hadoop-2.6.0-cdh5.7.0/lib/g6-hadoop-1.0.jar com.ruozedata.hadoop.mapreduce.driver.LogETLDriver /g6/hadoop/accesslog/${process_date}.log /g6/hadoop/access/output/day=${process_date}
    
    echo "--------------------step2:mv data to DW--------------------"
    # -f keeps the removal from failing when the partition directory does not
    # exist yet (first run for this date); the script can therefore be re-run.
    hdfs dfs -rm -r -f /g6/hadoop/access/clear/day=${process_date}
    hdfs dfs -mv  /g6/hadoop/access/output/day=${process_date} /g6/hadoop/access/clear/
    
    echo "--------------------step3:flush metadata--------------------"
    # Make the moved directory visible to queries by adding its partition.
    hive -e "use g6_hadoop; alter table g6_access add if not exists partition(day='${process_date}');"
    

    然后运行shell脚本:

    [hadoop@10-9-140-90 shell]$ ./g6-train-hadoop.sh 
    USAGE: g6-train-hadoop.sh <dateString>
     e.g.: g6-train-hadoop.sh 20180717
    [hadoop@10-9-140-90 shell]$ ./g6-train-hadoop.sh 20180717
    ......此处省略
    

    运行成功后,去hive里查看一下:

    hive (g6_hadoop)> select * from g6_access where day=20180717 limit 1;
    OK
    cdn     region  level   time    ip      domain  url     traffic day
    baidu   CN      E       20180717042142  156.89.48.178   v2.go2yd.com    http://v1.go2yd.com/user_upload/1531633977627104fdecdc68fe7a2c4b96b2226fd3f4c.mp4_bd.mp4       62109   20180717
    Time taken: 0.42 seconds, Fetched: 1 row(s)
    hive (g6_hadoop)> 
    
    展开全文
  • Shell脚本 数据清洗

    2017-11-01 11:30:00
    需要做的任务是将上图类似的格式的文件进行处理,将年月日小时分别提取出来放到每行的行尾(上图已清洗好) 自己的思路是先用cut命令将每行的年月日小时提取出来,分别给一个变量,然后再循环利用sed命令将年月日...
  • 此次数据是用来导入HIVE表中,但是由于数据不规范,需要做清洗, 下面的数据不仅有空出一行,并且每个需要录入的字段都有空格。 | 1 | 2 | 2016-06-03 | | 1 | 3 | 2016-06-08 | | 2 | 3 | 2016-06-08 | | 3 | 4 ...
  • 本实例通过python脚本对电影数据进行清洗,帮助读者了解hive调用python脚本的整个流程。 操作步骤: 1、创建基表 CREATE TABLE u_data ( userid INT, //用户ID movieid INT, //电影ID rating INT, //电影...
  • Kettle7中使用Java脚本进行数据清洗

    千次阅读 2019-03-29 17:13:14
    1.首先,Kettle7.1下载链接:https://sourceforge.net/projects/pentaho/files/Data%20Integration/7.1/pdi-ce-7.1.0.0-12.zip/download 2.打开这个文件:Spoon.bat 3.假设就这三个步骤: ...
  • python之数据清洗脚本

    2018-11-09 19:44:24
    #coding=utf-8 import numpy as np import pandas as pd na_list=['NO CLUE','N/A...data=pd.read_csv('311-service-requests.csv',na_values=na_list,dtype={'Incident Zip':str})#列值数据类型设置为字符串 #prin...
  • 使用脚本清洗 AS (userid, movieid, rating, weekday) -- 输出值(子表) FROM ml_100k;     然后我就失败了   转载于:https://www.cnblogs.com/wqbin/p/10363135.html
  • APP_JAR=$HADOOP_HOME/share/hadoop/mapreduce/dataclean-1.0-SNAPSHOT.jar INPUT=/input/oldlog1 OUTPUT=/output hadoop fs -rm -r $OUTPUT PROG=com.etc.RunJob hadoop jar $APP_JAR $PROG $INPUT $OUTPUT ~
  • bash脚本实例-linux性能数据清洗-1

    千次阅读 2016-07-25 18:08:47
    性能采集脚本将用uptime、vmstat、free、iostat命令采集到的数据记录到了一个日志文件中。本脚本就是将按照不同的标签把数据提取出来,加上标题栏保存为csv文件,方便进一步的分析。 性能数据记录文件。文件的第一...

空空如也

空空如也

1 2 3 4 5 ... 17
收藏数 335
精华内容 134
关键字:

脚本清洗数据