Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

cdzhoubin/DotnetSpider

Open more actions menu
 
 

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

Repository files navigation

DotnetSpider

=================

This is a cross-platform, lightweight spider developed in C#.

DESIGN

demo

BASE USAGE

	/// <summary>
	/// Demo entry point: builds a single-threaded spider, seeds it with the first
	/// four Youku listing pages and starts the crawl.
	/// </summary>
	public static void Main()
	{
		Core.Spider spider = Core.Spider
			.Create(new MyPageProcessor(), new QueueDuplicateRemovedScheduler())
			.AddPipeline(new MyPipeline())
			.SetThreadNum(1);

		var site = new Site() { EncodingName = "UTF-8" };
		for (int i = 1; i < 5; ++i)
		{
			// The page number is the trailing "_p_{i}" segment. Without it every
			// iteration adds the same URL, which the duplicate-removing scheduler
			// would collapse into a single request.
			site.AddStartUrl($"http://www.youku.com/v_olist/c_97_g__a__sg__mt__lg__q__s_1_r_0_u_0_pt_0_av_0_ag_0_sg__pr__h__d_1_p_{i}.html");
		}

		spider.Site = site;
		spider.Start();
	}

	/// <summary>
	/// Pipeline that prints every extracted video to the console.
	/// </summary>
	private class MyPipeline : IPipeline
	{
		/// <summary>
		/// Writes one "name:click" line for each item stored under the
		/// "VideoResult" key of <paramref name="resultItems"/>.
		/// </summary>
		/// <param name="resultItems">Results produced by the page processor.</param>
		/// <param name="spider">The spider that produced the results (not used here).</param>
		public void Process(ResultItems resultItems, ISpider spider)
		{
			foreach (YoukuVideo video in resultItems.Results["VideoResult"])
			{
				string line = $"{video.Name}:{video.Click}";
				Console.WriteLine(line);
			}

			// Persisting the results (for example to a database) could be added here.
		}

		/// <summary>
		/// Intentionally empty: this pipeline holds nothing that needs releasing.
		/// </summary>
		public void Dispose()
		{
		}
	}

	/// <summary>
	/// Page processor that extracts one <see cref="YoukuVideo"/> per video tile
	/// found on a listing page and publishes them under the "VideoResult" key.
	/// </summary>
	private class MyPageProcessor : IPageProcessor
	{
		/// <summary>
		/// Selects every <c>div</c> with class <c>yk-col3</c>, reads the name and
		/// click text from each, and adds the resulting list to the page results.
		/// </summary>
		/// <param name="page">The downloaded page to extract videos from.</param>
		public void Process(Page page)
		{
			var totalVideoElements = page.Selectable.SelectList(Selectors.XPath("//div[@class='yk-col3']")).Nodes();
			List<YoukuVideo> results = new List<YoukuVideo>();
			foreach (var videoElement in totalVideoElements)
			{
				var video = new YoukuVideo();
				video.Name = videoElement.Select(Selectors.XPath("/div[4]/div[1]/a")).Value;

				// YoukuVideo.Click is declared as string, so the extracted text is
				// stored as-is; assigning the result of int.Parse did not compile.
				// NOTE(review): "p-num" matches an element named p-num; if a CSS
				// class was intended the selector should be ".p-num" — confirm.
				video.Click = videoElement.Select(Selectors.Css("p-num")).Value.ToString();

				results.Add(video);
			}
			page.AddResultItem("VideoResult", results);
		}

		/// <summary>
		/// Site settings for this processor; returns a new <c>Site</c> with
		/// <c>SleepTime = 0</c> on every access.
		/// </summary>
		public Site Site => new Site { SleepTime = 0 };
	}

	/// <summary>
	/// Simple data holder for one video extracted from a listing page.
	/// </summary>
	public class YoukuVideo
	{
		/// <summary>Name text extracted for the video.</summary>
		public string Name { get; set; }

		/// <summary>Click text extracted for the video, kept as a string.</summary>
		public string Click { get; set; }
	}

ADDITIONAL USAGE

    /// <summary>
    /// Attribute-driven spider sample: crawls a JD product list page, extracts
    /// one <see cref="Product"/> per list item and hands the results to a MySQL pipeline.
    /// </summary>
    public class JdSkuSpider : SpiderBuilder
{
	/// <summary>
	/// Builds the spider configuration: thread count, name, paging link extractor,
	/// storage pipeline, start URL (with its extra "name"/"cat3" values) and entity type.
	/// </summary>
	/// <returns>The configured <c>SpiderContext</c>.</returns>
	protected override SpiderContext GetSpiderContext()
	{
		SpiderContext context = new SpiderContext();
		context.SetThreadNum(8);
		context.SetSpiderName("JD sku/store test " + DateTime.Now.ToString("yyyy-MM-dd HHmmss"));

		// Follow-up URLs: links inside the p-num span whose URL contains "&page=<n>&".
		context.AddTargetUrlExtractor(new Extension.Configuration.TargetUrlExtractor
		{
			Region = new Extension.Configuration.Selector { Type = ExtractType.XPath, Expression = "//span[@class=\"p-num\"]" },
			Patterns = new List<string> { @"&page=[0-9]+&" }
		});

		// Never commit real hosts or passwords. Replace the placeholders below, or
		// better, load the connection string from configuration or an environment variable.
		context.AddPipeline(new MysqlPipeline
		{
			ConnectString = "Database='test';Data Source=<your-mysql-host>;User ID=<your-user>;Password=<your-password>;Port=3306"
		});

		// The dictionary values are what the "name" and "cat3" Enviroment
		// expressions on Product refer to.
		context.AddStartUrl("http://list.jd.com/list.html?cat=9987,653,655&page=2&JL=6_0_0&ms=5#J_main", new Dictionary<string, object> { { "name", "手机" }, { "cat3", "655" } });
		context.AddEntityType(typeof(Product));

		return context;
	}

	/// <summary>
	/// One product entry from the list page. Each property declares where its value
	/// is extracted from (<c>PropertyExtractBy</c>) and how it is stored (<c>StoredAs</c>).
	/// </summary>
	// NOTE(review): TableSuffix.Today presumably appends the current date to the
	// "sku" table name — confirm against the Schema attribute implementation.
	[Schema("test", "sku", TableSuffix.Today)]
	[TypeExtractBy(Expression = "//li[@class='gl-item']/div[contains(@class,'j-sku-item')]", Multi = true)]
	[Indexes(Index = new[] { "category" }, Unique = new[] { "category,sku", "sku" })]
	public class Product : ISpiderEntity
	{
		/// <summary>Category name, taken from the "name" value supplied with the start URL.</summary>
		[StoredAs("category", DataType.String, 20)]
		[PropertyExtractBy(Expression = "name", Type = ExtractType.Enviroment)]
		public string CategoryName { get; set; }

		/// <summary>Category id, taken from the "cat3" value supplied with the start URL.</summary>
		[StoredAs("cat3", DataType.String, 20)]
		[PropertyExtractBy(Expression = "cat3", Type = ExtractType.Enviroment)]
		public int CategoryId { get; set; }

		/// <summary>Link of the product, read from the first child div's anchor href.</summary>
		[StoredAs("url", DataType.Text)]
		[PropertyExtractBy(Expression = "./div[1]/a/@href")]
		public string Url { get; set; }

		/// <summary>SKU, read from the item's data-sku attribute.</summary>
		[StoredAs("sku", DataType.String, 25)]
		[PropertyExtractBy(Expression = "./@data-sku")]
		public string Sku { get; set; }

		/// <summary>Comment count, read from the anchor inside the fifth child div.</summary>
		[StoredAs("commentscount", DataType.String, 32)]
		[PropertyExtractBy(Expression = "./div[5]/strong/a")]
		public long CommentsCount { get; set; }

		/// <summary>Shop name, read from the p-shop div's data-shop_name attribute.</summary>
		[StoredAs("shopname", DataType.String, 100)]
		[PropertyExtractBy(Expression = ".//div[@class='p-shop']/@data-shop_name")]
		public string ShopName { get; set; }

		/// <summary>Product name, read from the em element inside the p-name div.</summary>
		[StoredAs("name", DataType.String, 50)]
		[PropertyExtractBy(Expression = ".//div[@class='p-name']/a/em")]
		public string Name { get; set; }

		/// <summary>Vendor id, read from the item's venderid attribute.</summary>
		[StoredAs("venderid", DataType.String, 25)]
		[PropertyExtractBy(Expression = "./@venderid")]
		public string VenderId { get; set; }

		/// <summary>Shop id, read from the item's jdzy_shop_id attribute.</summary>
		[StoredAs("jdzy_shop_id", DataType.String, 25)]
		[PropertyExtractBy(Expression = "./@jdzy_shop_id")]
		public string JdzyShopId { get; set; }

		/// <summary>Run identifier filled from the "Monday" Enviroment expression.</summary>
		// NOTE(review): presumably the date of the current week's Monday — confirm.
		[StoredAs("run_id", DataType.Date)]
		[PropertyExtractBy(Expression = "Monday", Type = ExtractType.Enviroment)]
		public DateTime RunId { get; set; }

		/// <summary>Creation time filled from the "Now" Enviroment expression.</summary>
		[PropertyExtractBy(Expression = "Now", Type = ExtractType.Enviroment)]
		[StoredAs("cdate", DataType.Time)]
		public DateTime CDate { get; set; }
	}
}

// Create the spider defined above and run it.
var jdSpider = new JdSkuSpider();
jdSpider.Run();

NOTICE

  1. When you use the Redis scheduler, please update your Redis config: set `timeout 30` and `tcp-keepalive 60`.

UPDATES

1.0.0.0-PRE

AREAS FOR IMPROVEMENT

QQ: 477731655

About

dotnet spider

Resources

License

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published

Languages

  • C# 99.9%
  • Shell 0.1%
Morty Proxy This is a proxified and sanitized view of the page, visit original site.