1 Preface

2 Web Crawlers

2.1 Building Your Own Scrapy Crawler

2.1.1 items.py
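
A minimal sketch of items.py, with fields inferred from what the spider in the next subsection assigns (the original file may declare additional fields):

# -*- coding: utf-8 -*-
# items.py: declares the fields that the spider fills in.
import scrapy

class Stock163Item(scrapy.Item):
    news_thread = scrapy.Field()  # unique id parsed from the article URL
    news_title = scrapy.Field()   # article title
    news_time = scrapy.Field()    # publication time
    news_from = scrapy.Field()    # name of the source outlet
    from_url = scrapy.Field()     # link to the original source
    news_body = scrapy.Field()    # list of body paragraphs
    news_url = scrapy.Field()     # URL of the crawled page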

2.1.2 The spiders Subdirectory

# encoding: utf-8
import scrapy
import re
from scrapy.selector import Selector
from stock163.items import Stock163Item
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ExampleSpider(CrawlSpider):
    name = "stocknews"  # the spider is registered under the name "stocknews"
    allowed_domains = ["money.163.com"]  # restrict crawling to this domain

    def __init__(self, id='600000', page='0', *args, **kwargs):
        # The constructor accepts a stock id (default '600000'), a page number
        # (default '0'), and any further arguments for the parent class.
        # allowrule = r"/%s/%s\d+/\d+/.*" % (year, month)
        # allowrule = r"/%s/%s%s/\d+/.*" % (year, month, day)  # would match date paths such as "/2022/1125/..."
        allowrule = r"/\d+/\d+/\d+/.*"  # regex for news links: digits/digits/digits/anything
        self.counter = 0  # counter tracking how many news items have been crawled
        self.stock_id = id  # save the stock id
        self.start_urls = ['http://quotes.money.163.com/f10/gsxw_%s,%s.html' % (id, page)]  # build the start URL from id and page
        ExampleSpider.rules = (Rule(LinkExtractor(allow=allowrule), callback="parse_news", follow=False),)
        # Crawling rule: a LinkExtractor picks out links matching allowrule and hands
        # each matched page to the parse_news callback; follow=False means links found
        # on those pages are not followed any further.
        # recompile the rule
        super(ExampleSpider, self).__init__(*args, **kwargs)
        # Call the parent (CrawlSpider) initializer so the spider is set up correctly.

    '''
    rules=Rule(LinkExtractor(allow=r"/d+/d+/d+/*"),
               callback="parse_news", follow=True
    )
    '''

    # f = open("out.txt", "w")

    def printcn(self, suni):
        # Debug helper: print each string in a sequence as UTF-8.
        for i in suni:
            print(i.encode('utf-8'))

    def parse_news(self, response):
        item = Stock163Item()
        item['news_thread'] = response.url.strip().split('/')[-1][:-5]
        # Extract the news thread id from the response URL: strip whitespace, split
        # the URL on '/', take the last component (the file name), and drop the last
        # five characters (the '.html' extension). A URL ending in '/ABCDE12345.html',
        # for instance, yields 'ABCDE12345' as the thread id.
        self.get_title(response, item)
        self.get_source(response, item)
        self.get_url(response, item)
        self.get_news_from(response, item)
        self.get_from_url(response, item)
        self.get_text(response, item)

        return item  # remember to return the item after parsing

    def get_title(self, response, item):
        title = response.xpath("/html/head/title/text()").extract()
        if title:
            # print ('title:'+title[0][:-5].encode('utf-8'))
            item['news_title'] = title[0][:-5]

    def get_source(self, response, item):
        source = response.xpath("//div[@class='left']/text()").extract()
        if source:
            # print ('source'+source[0][:-5].encode('utf-8'))
            item['news_time'] = source[0][:-5]

    def get_news_from(self, response, item):
        news_from = response.xpath("//div[@class='left']/a/text()").extract()
        if news_from:
            # print 'from'+news_from[0].encode('utf-8')
            item['news_from'] = news_from[0]

    def get_from_url(self, response, item):
        from_url = response.xpath("//div[@class='left']/a/@href").extract()
        if from_url:
            # print ('url'+from_url[0].encode('utf-8')        )
            item['from_url'] = from_url[0]

    def get_text(self, response, item):
        news_body = response.xpath("//div[@id='endText']/p/text()").extract()
        if news_body:
            # for  entry in news_body:
            #     print (entry.encode('utf-8'))
            item['news_body'] = news_body

    def get_url(self, response, item):
        news_url = response.url
        if news_url:
            print(news_url)
        item['news_url'] = news_url


2.1.3 pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
#encoding: utf-8
import os
def ParseFilePath(url, id):
    # Users should change this output folder to fit their own machine.
    outfolder = r"e:\data\FinTech\News\Stocks\%s" % id
    # For a (hypothetical) URL like http://money.163.com/23/1125/10/ABCDE12345.html
    # the components below are year='23', monthday='1125', idx='10',
    # and page='10_ABCDE12345.html'.
    components = url.split("/")
    year = components[3]
    monthday = components[4]
    month = monthday[:2]
    day = monthday[2:]
    idx = components[5]
    page = idx + "_" + components[6]
    # folder = outfolder + r"\%s_%s_%s_" % (year, month, day)
    folder = outfolder
    if (year == '') or ('keywords' in page):
        filepath = 'xxx'  # sentinel: this URL is not a regular news article page
    else:
        filepath = folder + r"\%s_%s_%s_%s.txt" % (year, month, day, page)
    filepath = filepath.replace('?', '_')
    return (folder, filepath)

class Stock163Pipeline(object):
    def process_item(self, item, spider):
        if spider.name != "stocknews":
            return item
        if item.get("news_thread", None) is None:
            return item

        url = item['news_url']
        if 'keywords' in url:
            return item  # skip keyword/search pages
        folder, filepath = ParseFilePath(url, spider.stock_id)
        spider.counter = spider.counter + 1
        counterfilepath = folder + r"\counter.txt"
        # On a single machine there is virtually no risk of a race condition.
        if not os.path.exists(folder):
            os.makedirs(folder)
        # print(filepath, counterfilepath)
        # print(spider.stats)
        fo = open(counterfilepath, "w", encoding="UTF-8")
        fo.write(str(spider.counter))
        fo.close()

        if filepath != 'xxx':
            fo = open(filepath, 'w', encoding='utf-8')
            fo.write(str(dict(item)))
            fo.close()
        return item
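
As the header comment notes, the pipeline only runs if it is registered in the project's settings.py. A minimal sketch, assuming the project module is named stock163 as in the imports above (300 is just a conventional priority value):

# settings.py (excerpt)
ITEM_PIPELINES = {
    'stock163.pipelines.Stock163Pipeline': 300,
}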
        

2.2 Building a Scrapy Crawler That Accepts Parameters

Compared with section 2.1.2, the fragment below parameterizes the start site and the date filter; Scrapy fills these constructor arguments in from the command line (the defaults shown are placeholders):

class ExampleSpider(CrawlSpider):
    name = "stocknews"

    def __init__(self, site='money.163.com', year='2022', month='11', day='25', id='600000', *args, **kwargs):
        # allowrule = r"/%s/%s\d+/\d+/.*" % (year, month)
        allowrule = r"/%s/%s%s/\d+/.*" % (year, month, day)  # only follow news under the given date
        self.counter = 0
        self.stock_id = id
        self.start_urls = ['http://%s' % (site)]
        ExampleSpider.rules = (Rule(LinkExtractor(allow=allowrule), callback="parse_news", follow=False),)
        # recompile the rule
        super(ExampleSpider, self).__init__(*args, **kwargs)
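
Assuming the parameterized signature above, the site and date are supplied at launch time through Scrapy's -a option, for example:

scrapy crawl stocknews -a site=money.163.com -a year=2022 -a month=11 -a day=25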

2.3 Running the Scrapy Crawler

2.3.1 Running from the Command Line
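
A spider is normally launched from the project root with the scrapy crawl command; the -a option forwards keyword arguments to the spider's __init__. For the spider of section 2.1.2:

scrapy crawl stocknews -a id=600000 -a page=0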

2.3.2 Calling Scrapy from a Program

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # load the project's settings.py
for site in ['money.163.com', 'tech.163.com', 'money.163.com/stock']:
    process.crawl('stocknews', site=site)  # queue one crawl per site; arguments reach __init__
process.start()  # run all queued crawls; blocks until they finish

2.4 Key Points for Running Scrapy

3 Storage and Analysis of Large-Scale Unstructured Data

4 Complete Code

from keras.applications.vgg16 import VGG16
from keras.layers import Input,Flatten,Dense,Dropout
from keras.models import Model
from keras.optimizers import SGD

from keras.datasets import mnist

import cv2
import h5py
import numpy as np

ishape = 224  # side length of the VGG16 input; MNIST images are resized to this below
model_vgg = VGG16(include_top=False, weights='imagenet', input_shape=(ishape, ishape, 3))
model = Flatten(name='flatten')(model_vgg.output)
model = Dense(4096,activation='relu',name='fc1')(model)
model = Dense(4096,activation='relu',name='fc2')(model)
model = Dropout(0.5)(model)
model = Dense(10,activation='softmax')(model)
model_vgg_mnist = Model(model_vgg.input,model,name='vgg16')

model_vgg_mnist.summary()

model_vgg = VGG16(include_top=False,weights='imagenet',input_shape=(224,224,3))
for layer in model_vgg.layers:
    layer.trainable=False
model = Flatten()(model_vgg.output)
model = Dense(4096,activation='relu',name='fc1')(model)
model = Dense(4096,activation='relu',name='fc2')(model)
model = Dropout(0.5)(model)
model = Dense(10,activation='softmax',name='prediction')(model)
model_vgg_mnist_pretrain = Model(model_vgg.input,model,name='vgg16_pretrain')

model_vgg_mnist_pretrain.summary()

sgd = SGD(lr = 0.05,decay=1e-5)
model_vgg_mnist_pretrain.compile(loss='categorical_crossentropy',optimizer=sgd,metrics=['accuracy'])

(x_train,y_train),(x_test,y_test) = mnist.load_data()
x_train = [cv2.cvtColor(cv2.resize(i,(ishape,ishape)),cv2.COLOR_GRAY2BGR) for i in x_train]
x_train = np.concatenate([arr[np.newaxis] for arr in x_train]).astype('float32')
x_test = [cv2.cvtColor(cv2.resize(i,(ishape,ishape)),cv2.COLOR_GRAY2BGR) for i in x_test]
x_test = np.concatenate([arr[np.newaxis] for arr in x_test]).astype('float32')

print(x_test.shape)
print(x_train.shape)

x_train /= 255
x_test /= 255

print(np.where(x_train[0] != 0))  # sanity check: nonzero pixel positions of the first training image

def tran_y(y):
    y_ohe = np.zeros(10)
    y_ohe[y] = 1
    return y_ohe

y_train_ohe = np.array([tran_y(y_train[i]) for i in range(len(y_train))])
y_test_ohe = np.array([tran_y(y_test[i]) for i in range(len(y_test))])

model_vgg_mnist_pretrain.fit(x_train,y_train_ohe,validation_data=(x_test,y_test_ohe),epochs=200,batch_size=128)
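
Incidentally, the hand-rolled tran_y encoder above duplicates what Keras already ships as a utility; the two list comprehensions could equally be written with keras.utils.to_categorical:

from keras.utils import to_categorical
y_train_ohe = to_categorical(y_train, 10)
y_test_ohe = to_categorical(y_test, 10)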

Original article: https://blog.csdn.net/m0_51797359/article/details/134628496
